/**
  * Construct a scaling executor builder; the settings will have the specified key prefix.
  *
  * @param name the name of the executor
  * @param core the minimum number of threads in the pool
  * @param max the maximum number of threads in the pool
  * @param keepAlive the time that spare threads above {@code core} threads will be kept alive
  * @param prefix the prefix for the settings keys
  */
 public ScalingExecutorBuilder(
     final String name,
     final int core,
     final int max,
     final TimeValue keepAlive,
     final String prefix) {
   super(name);
   this.coreSetting =
       Setting.intSetting(settingsKey(prefix, "core"), core, Setting.Property.NodeScope);
   this.maxSetting =
       Setting.intSetting(settingsKey(prefix, "max"), max, Setting.Property.NodeScope);
   this.keepAliveSetting =
       Setting.timeSetting(
           settingsKey(prefix, "keep_alive"), keepAlive, Setting.Property.NodeScope);
 }
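For illustration, a hedged sketch of constructing such a builder; the executor name "warmer" and the prefix are assumptions, not values taken from this snippet. With prefix "thread_pool.warmer", the keys registered above become thread_pool.warmer.core, thread_pool.warmer.max, and thread_pool.warmer.keep_alive.

// hypothetical usage: registers core/max/keep_alive settings under the given prefix
ScalingExecutorBuilder warmerBuilder =
    new ScalingExecutorBuilder(
        "warmer", 1, 4, TimeValue.timeValueMinutes(5), "thread_pool.warmer");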
/**
 * Similar to the {@link ClusterRebalanceAllocationDecider}, this {@link AllocationDecider} controls
 * the number of currently in-progress re-balance (relocation) operations and restricts node
 * allocations if the configured threshold is reached. The default number of concurrent rebalance
 * operations is <tt>2</tt>.
 *
 * <p>Re-balance operations can be controlled in real-time via the cluster update API using
 * <tt>cluster.routing.allocation.cluster_concurrent_rebalance</tt>. Iff this setting is set to
 * <tt>-1</tt>, the number of concurrent re-balance operations is unlimited.
 */
public class ConcurrentRebalanceAllocationDecider extends AllocationDecider {

  public static final String NAME = "concurrent_rebalance";

  public static final Setting<Integer>
      CLUSTER_ROUTING_ALLOCATION_CLUSTER_CONCURRENT_REBALANCE_SETTING =
          Setting.intSetting(
              "cluster.routing.allocation.cluster_concurrent_rebalance",
              2,
              -1,
              Property.Dynamic,
              Property.NodeScope);
  private volatile int clusterConcurrentRebalance;

  public ConcurrentRebalanceAllocationDecider(Settings settings, ClusterSettings clusterSettings) {
    super(settings);
    this.clusterConcurrentRebalance =
        CLUSTER_ROUTING_ALLOCATION_CLUSTER_CONCURRENT_REBALANCE_SETTING.get(settings);
    logger.debug("using [cluster_concurrent_rebalance] with [{}]", clusterConcurrentRebalance);
    clusterSettings.addSettingsUpdateConsumer(
        CLUSTER_ROUTING_ALLOCATION_CLUSTER_CONCURRENT_REBALANCE_SETTING,
        this::setClusterConcurrentRebalance);
  }

  private void setClusterConcurrentRebalance(int concurrentRebalance) {
    clusterConcurrentRebalance = concurrentRebalance;
  }

  @Override
  public Decision canRebalance(ShardRouting shardRouting, RoutingAllocation allocation) {
    if (clusterConcurrentRebalance == -1) {
      return allocation.decision(Decision.YES, NAME, "unlimited concurrent rebalances are allowed");
    }
    int relocatingShards = allocation.routingNodes().getRelocatingShardCount();
    if (relocatingShards >= clusterConcurrentRebalance) {
      return allocation.decision(
          Decision.NO,
          NAME,
          "too many shards are concurrently rebalancing [%d], limit: [%d]",
          relocatingShards,
          clusterConcurrentRebalance);
    }
    return allocation.decision(
        Decision.YES,
        NAME,
        "below threshold [%d] for concurrent rebalances, current rebalance shard count [%d]",
        clusterConcurrentRebalance,
        relocatingShards);
  }
}
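Because the setting is declared with Property.Dynamic, it can be changed at runtime without a restart. A minimal sketch, assuming a connected client instance named client; the value 4 is illustrative:

// assumed client instance; raises the concurrent rebalance limit to 4 cluster-wide
client.admin().cluster().prepareUpdateSettings()
    .setTransientSettings(
        Settings.builder().put("cluster.routing.allocation.cluster_concurrent_rebalance", 4))
    .get();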
Example #3
public class EsExecutors {

  /**
   * Settings key to manually set the number of available processors. This is used to adjust
   * thread pool sizes, etc., per node.
   */
  public static final Setting<Integer> PROCESSORS_SETTING =
      Setting.intSetting(
          "processors", Runtime.getRuntime().availableProcessors(), 1, Property.NodeScope);

  /**
   * Returns the number of available processors. Defaults to {@link Runtime#availableProcessors()}
   * but can be overridden by passing a {@link Settings} instance with the key "processors" set to
   * the desired value.
   *
   * @param settings a {@link Settings} instance from which to derive the available processors
   * @return the number of available processors
   */
  public static int numberOfProcessors(final Settings settings) {
    return PROCESSORS_SETTING.get(settings);
  }
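  // For illustration: with Settings.builder().put("processors", 4).build(), the method above
  // returns 4 instead of the JVM-reported processor count.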

  public static PrioritizedEsThreadPoolExecutor newSinglePrioritizing(
      String name, ThreadFactory threadFactory, ThreadContext contextHolder) {
    return new PrioritizedEsThreadPoolExecutor(
        name, 1, 1, 0L, TimeUnit.MILLISECONDS, threadFactory, contextHolder);
  }

  public static EsThreadPoolExecutor newScaling(
      String name,
      int min,
      int max,
      long keepAliveTime,
      TimeUnit unit,
      ThreadFactory threadFactory,
      ThreadContext contextHolder) {
    ExecutorScalingQueue<Runnable> queue = new ExecutorScalingQueue<>();
    EsThreadPoolExecutor executor =
        new EsThreadPoolExecutor(
            name,
            min,
            max,
            keepAliveTime,
            unit,
            queue,
            threadFactory,
            new ForceQueuePolicy(),
            contextHolder);
    queue.executor = executor;
    return executor;
  }

  public static EsThreadPoolExecutor newFixed(
      String name,
      int size,
      int queueCapacity,
      ThreadFactory threadFactory,
      ThreadContext contextHolder) {
    BlockingQueue<Runnable> queue;
    if (queueCapacity < 0) {
      queue = ConcurrentCollections.newBlockingQueue();
    } else {
      queue =
          new SizeBlockingQueue<>(
              ConcurrentCollections.<Runnable>newBlockingQueue(), queueCapacity);
    }
    return new EsThreadPoolExecutor(
        name,
        size,
        size,
        0,
        TimeUnit.MILLISECONDS,
        queue,
        threadFactory,
        new EsAbortPolicy(),
        contextHolder);
  }

  public static String threadName(Settings settings, String... names) {
    String namePrefix =
        Arrays.stream(names)
            .filter(name -> name != null)
            .collect(Collectors.joining(".", "[", "]"));
    return threadName(settings, namePrefix);
  }

  public static String threadName(Settings settings, String namePrefix) {
    if (Node.NODE_NAME_SETTING.exists(settings)) {
      return threadName(Node.NODE_NAME_SETTING.get(settings), namePrefix);
    } else {
      return threadName("", namePrefix);
    }
  }

  public static String threadName(final String nodeName, final String namePrefix) {
    return "elasticsearch"
        + (nodeName.isEmpty() ? "" : "[")
        + nodeName
        + (nodeName.isEmpty() ? "" : "]")
        + "["
        + namePrefix
        + "]";
  }

  public static ThreadFactory daemonThreadFactory(Settings settings, String namePrefix) {
    return daemonThreadFactory(threadName(settings, namePrefix));
  }

  public static ThreadFactory daemonThreadFactory(Settings settings, String... names) {
    return daemonThreadFactory(threadName(settings, names));
  }

  public static ThreadFactory daemonThreadFactory(String namePrefix) {
    return new EsThreadFactory(namePrefix);
  }

  static class EsThreadFactory implements ThreadFactory {

    final ThreadGroup group;
    final AtomicInteger threadNumber = new AtomicInteger(1);
    final String namePrefix;

    public EsThreadFactory(String namePrefix) {
      this.namePrefix = namePrefix;
      SecurityManager s = System.getSecurityManager();
      group = (s != null) ? s.getThreadGroup() : Thread.currentThread().getThreadGroup();
    }

    @Override
    public Thread newThread(Runnable r) {
      Thread t = new Thread(group, r, namePrefix + "[T#" + threadNumber.getAndIncrement() + "]", 0);
      t.setDaemon(true);
      return t;
    }
  }

  /** Cannot instantiate. */
  private EsExecutors() {}

  static class ExecutorScalingQueue<E> extends LinkedTransferQueue<E> {

    ThreadPoolExecutor executor;

    public ExecutorScalingQueue() {}

    @Override
    public boolean offer(E e) {
      // first try to transfer to a waiting worker thread
      if (!tryTransfer(e)) {
        // check if there might be spare capacity in the thread
        // pool executor
        int left = executor.getMaximumPoolSize() - executor.getCorePoolSize();
        if (left > 0) {
          // reject queuing the task to force the thread pool
          // executor to add a worker if it can; combined
          // with ForceQueuePolicy, this causes the thread
          // pool to always scale up to max pool size and we
          // only queue when there is no spare capacity
          return false;
        } else {
          return super.offer(e);
        }
      } else {
        return true;
      }
    }
  }

  /**
   * A handler for rejected tasks that adds the rejected task back onto the executor's queue,
   * waiting if necessary for space to become available.
   */
  static class ForceQueuePolicy implements XRejectedExecutionHandler {
    @Override
    public void rejectedExecution(Runnable r, ThreadPoolExecutor executor) {
      try {
        executor.getQueue().put(r);
      } catch (InterruptedException e) {
        // should never happen since we never wait
        throw new EsRejectedExecutionException(e);
      }
    }

    @Override
    public long rejected() {
      return 0;
    }
  }
}
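A hedged sketch of the factory methods above, assuming ThreadContext can be constructed from Settings.EMPTY:

ThreadContext context = new ThreadContext(Settings.EMPTY);
// threadName("node-1", "generic") yields "elasticsearch[node-1][generic]" per the code above
ThreadFactory factory =
    EsExecutors.daemonThreadFactory(EsExecutors.threadName("node-1", "generic"));
// a fixed pool of 4 threads with a bounded queue of 100 tasks, rejecting via EsAbortPolicy
EsThreadPoolExecutor fixed = EsExecutors.newFixed("generic", 4, 100, factory, context);
// a scaling pool that grows from 1 to 4 threads and only queues when at max capacity
EsThreadPoolExecutor scaling =
    EsExecutors.newScaling("scaling", 1, 4, 30L, TimeUnit.SECONDS, factory, context);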
Example #4
public class GatewayService extends AbstractLifecycleComponent implements ClusterStateListener {

  public static final Setting<Integer> EXPECTED_NODES_SETTING =
      Setting.intSetting("gateway.expected_nodes", -1, -1, Property.NodeScope);
  public static final Setting<Integer> EXPECTED_DATA_NODES_SETTING =
      Setting.intSetting("gateway.expected_data_nodes", -1, -1, Property.NodeScope);
  public static final Setting<Integer> EXPECTED_MASTER_NODES_SETTING =
      Setting.intSetting("gateway.expected_master_nodes", -1, -1, Property.NodeScope);
  public static final Setting<TimeValue> RECOVER_AFTER_TIME_SETTING =
      Setting.positiveTimeSetting(
          "gateway.recover_after_time", TimeValue.timeValueMillis(0), Property.NodeScope);
  public static final Setting<Integer> RECOVER_AFTER_NODES_SETTING =
      Setting.intSetting("gateway.recover_after_nodes", -1, -1, Property.NodeScope);
  public static final Setting<Integer> RECOVER_AFTER_DATA_NODES_SETTING =
      Setting.intSetting("gateway.recover_after_data_nodes", -1, -1, Property.NodeScope);
  public static final Setting<Integer> RECOVER_AFTER_MASTER_NODES_SETTING =
      Setting.intSetting("gateway.recover_after_master_nodes", 0, 0, Property.NodeScope);

  public static final ClusterBlock STATE_NOT_RECOVERED_BLOCK =
      new ClusterBlock(
          1,
          "state not recovered / initialized",
          true,
          true,
          RestStatus.SERVICE_UNAVAILABLE,
          ClusterBlockLevel.ALL);

  public static final TimeValue DEFAULT_RECOVER_AFTER_TIME_IF_EXPECTED_NODES_IS_SET =
      TimeValue.timeValueMinutes(5);

  private final Gateway gateway;

  private final ThreadPool threadPool;

  private final AllocationService allocationService;

  private final ClusterService clusterService;

  private final TimeValue recoverAfterTime;
  private final int recoverAfterNodes;
  private final int expectedNodes;
  private final int recoverAfterDataNodes;
  private final int expectedDataNodes;
  private final int recoverAfterMasterNodes;
  private final int expectedMasterNodes;

  private final AtomicBoolean recovered = new AtomicBoolean();
  private final AtomicBoolean scheduledRecovery = new AtomicBoolean();

  @Inject
  public GatewayService(
      Settings settings,
      AllocationService allocationService,
      ClusterService clusterService,
      ThreadPool threadPool,
      GatewayMetaState metaState,
      TransportNodesListGatewayMetaState listGatewayMetaState,
      Discovery discovery,
      IndicesService indicesService) {
    super(settings);
    this.gateway =
        new Gateway(
            settings, clusterService, metaState, listGatewayMetaState, discovery, indicesService);
    this.allocationService = allocationService;
    this.clusterService = clusterService;
    this.threadPool = threadPool;
    // allow a delay to be configured for when indices will get created
    this.expectedNodes = EXPECTED_NODES_SETTING.get(this.settings);
    this.expectedDataNodes = EXPECTED_DATA_NODES_SETTING.get(this.settings);
    this.expectedMasterNodes = EXPECTED_MASTER_NODES_SETTING.get(this.settings);

    if (RECOVER_AFTER_TIME_SETTING.exists(this.settings)) {
      recoverAfterTime = RECOVER_AFTER_TIME_SETTING.get(this.settings);
    } else if (expectedNodes >= 0 || expectedDataNodes >= 0 || expectedMasterNodes >= 0) {
      recoverAfterTime = DEFAULT_RECOVER_AFTER_TIME_IF_EXPECTED_NODES_IS_SET;
    } else {
      recoverAfterTime = null;
    }
    this.recoverAfterNodes = RECOVER_AFTER_NODES_SETTING.get(this.settings);
    this.recoverAfterDataNodes = RECOVER_AFTER_DATA_NODES_SETTING.get(this.settings);
    // default the recover after master nodes to the minimum master nodes in the discovery
    if (RECOVER_AFTER_MASTER_NODES_SETTING.exists(this.settings)) {
      recoverAfterMasterNodes = RECOVER_AFTER_MASTER_NODES_SETTING.get(this.settings);
    } else {
      // TODO: change me once the minimum_master_nodes is changed too
      recoverAfterMasterNodes = settings.getAsInt("discovery.zen.minimum_master_nodes", -1);
    }

    // Add the not-recovered block as an initial state block; we don't allow anything until the
    // state is recovered
    this.clusterService.addInitialStateBlock(STATE_NOT_RECOVERED_BLOCK);
  }
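  // Illustration of the defaulting above: with gateway.expected_nodes set and no
  // gateway.recover_after_time configured, recoverAfterTime falls back to the five-minute
  // DEFAULT_RECOVER_AFTER_TIME_IF_EXPECTED_NODES_IS_SET.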

  @Override
  protected void doStart() {
    // use post applied so that the state will be visible to the background recovery thread we spawn
    // in performStateRecovery
    clusterService.addListener(this);
  }

  @Override
  protected void doStop() {
    clusterService.removeListener(this);
  }

  @Override
  protected void doClose() {}

  @Override
  public void clusterChanged(final ClusterChangedEvent event) {
    if (lifecycle.stoppedOrClosed()) {
      return;
    }

    final ClusterState state = event.state();

    if (state.nodes().isLocalNodeElectedMaster() == false) {
      // not our job to recover
      return;
    }
    if (state.blocks().hasGlobalBlock(STATE_NOT_RECOVERED_BLOCK) == false) {
      // already recovered
      return;
    }

    DiscoveryNodes nodes = state.nodes();
    if (state.nodes().getMasterNodeId() == null) {
      logger.debug("not recovering from gateway, no master elected yet");
    } else if (recoverAfterNodes != -1
        && (nodes.getMasterAndDataNodes().size()) < recoverAfterNodes) {
      logger.debug(
          "not recovering from gateway, nodes_size (data+master) [{}] < recover_after_nodes [{}]",
          nodes.getMasterAndDataNodes().size(),
          recoverAfterNodes);
    } else if (recoverAfterDataNodes != -1 && nodes.getDataNodes().size() < recoverAfterDataNodes) {
      logger.debug(
          "not recovering from gateway, nodes_size (data) [{}] < recover_after_data_nodes [{}]",
          nodes.getDataNodes().size(),
          recoverAfterDataNodes);
    } else if (recoverAfterMasterNodes != -1
        && nodes.getMasterNodes().size() < recoverAfterMasterNodes) {
      logger.debug(
          "not recovering from gateway, nodes_size (master) [{}] < recover_after_master_nodes [{}]",
          nodes.getMasterNodes().size(),
          recoverAfterMasterNodes);
    } else {
      boolean enforceRecoverAfterTime;
      String reason;
      if (expectedNodes == -1 && expectedMasterNodes == -1 && expectedDataNodes == -1) {
        // no expected_* settings are set; honor recover_after_time if it is there
        enforceRecoverAfterTime = true;
        reason = "recover_after_time was set to [" + recoverAfterTime + "]";
      } else {
        // at least one expected_* setting is set; check whether all of them are met and, if so,
        // ignore the timeout
        enforceRecoverAfterTime = false;
        reason = "";
        if (expectedNodes != -1
            && (nodes.getMasterAndDataNodes().size()
                < expectedNodes)) { // does not meet the expected...
          enforceRecoverAfterTime = true;
          reason =
              "expecting ["
                  + expectedNodes
                  + "] nodes, but only have ["
                  + nodes.getMasterAndDataNodes().size()
                  + "]";
        } else if (expectedDataNodes != -1
            && (nodes.getDataNodes().size() < expectedDataNodes)) { // does not meet the expected...
          enforceRecoverAfterTime = true;
          reason =
              "expecting ["
                  + expectedDataNodes
                  + "] data nodes, but only have ["
                  + nodes.getDataNodes().size()
                  + "]";
        } else if (expectedMasterNodes != -1
            && (nodes.getMasterNodes().size()
                < expectedMasterNodes)) { // does not meet the expected...
          enforceRecoverAfterTime = true;
          reason =
              "expecting ["
                  + expectedMasterNodes
                  + "] master nodes, but only have ["
                  + nodes.getMasterNodes().size()
                  + "]";
        }
      }
      performStateRecovery(enforceRecoverAfterTime, reason);
    }
  }

  private void performStateRecovery(boolean enforceRecoverAfterTime, String reason) {
    final Gateway.GatewayStateRecoveredListener recoveryListener = new GatewayRecoveryListener();

    if (enforceRecoverAfterTime && recoverAfterTime != null) {
      if (scheduledRecovery.compareAndSet(false, true)) {
        logger.info("delaying initial state recovery for [{}]. {}", recoverAfterTime, reason);
        threadPool.schedule(
            recoverAfterTime,
            ThreadPool.Names.GENERIC,
            () -> {
              if (recovered.compareAndSet(false, true)) {
                logger.info(
                    "recover_after_time [{}] elapsed. performing state recovery...",
                    recoverAfterTime);
                gateway.performStateRecovery(recoveryListener);
              }
            });
      }
    } else {
      if (recovered.compareAndSet(false, true)) {
        threadPool
            .generic()
            .execute(
                new AbstractRunnable() {
                  @Override
                  public void onFailure(Exception e) {
                    logger.warn("Recovery failed", e);
                    // `recovered` is reset in the listener; don't reset it here, otherwise
                    // there might be a race that resets it to false while a new recovery is
                    // already running
                    recoveryListener.onFailure("state recovery failed: " + e.getMessage());
                  }

                  @Override
                  protected void doRun() throws Exception {
                    gateway.performStateRecovery(recoveryListener);
                  }
                });
      }
    }
  }

  public Gateway getGateway() {
    return gateway;
  }

  class GatewayRecoveryListener implements Gateway.GatewayStateRecoveredListener {

    @Override
    public void onSuccess(final ClusterState recoveredState) {
      logger.trace("successful state recovery, importing cluster state...");
      clusterService.submitStateUpdateTask(
          "local-gateway-elected-state",
          new ClusterStateUpdateTask() {
            @Override
            public ClusterState execute(ClusterState currentState) {
              assert currentState.metaData().indices().isEmpty();

              // remove the block, since we recovered from gateway
              ClusterBlocks.Builder blocks =
                  ClusterBlocks.builder()
                      .blocks(currentState.blocks())
                      .blocks(recoveredState.blocks())
                      .removeGlobalBlock(STATE_NOT_RECOVERED_BLOCK);

              MetaData.Builder metaDataBuilder = MetaData.builder(recoveredState.metaData());
              // automatically generate a UUID for the metadata if we need to
              metaDataBuilder.generateClusterUuidIfNeeded();

              if (MetaData.SETTING_READ_ONLY_SETTING.get(recoveredState.metaData().settings())
                  || MetaData.SETTING_READ_ONLY_SETTING.get(currentState.metaData().settings())) {
                blocks.addGlobalBlock(MetaData.CLUSTER_READ_ONLY_BLOCK);
              }

              for (IndexMetaData indexMetaData : recoveredState.metaData()) {
                metaDataBuilder.put(indexMetaData, false);
                blocks.addBlocks(indexMetaData);
              }

              // update the state to reflect the new metadata and routing
              ClusterState updatedState =
                  ClusterState.builder(currentState)
                      .blocks(blocks)
                      .metaData(metaDataBuilder)
                      .build();

              // initialize all index routing tables as empty
              RoutingTable.Builder routingTableBuilder =
                  RoutingTable.builder(updatedState.routingTable());
              for (ObjectCursor<IndexMetaData> cursor :
                  updatedState.metaData().indices().values()) {
                routingTableBuilder.addAsRecovery(cursor.value);
              }
              // start with 0 based versions for routing table
              routingTableBuilder.version(0);

              // now, reroute
              updatedState =
                  ClusterState.builder(updatedState)
                      .routingTable(routingTableBuilder.build())
                      .build();
              return allocationService.reroute(updatedState, "state recovered");
            }

            @Override
            public void onFailure(String source, Exception e) {
              logger.error(
                  (Supplier<?>)
                      () -> new ParameterizedMessage("unexpected failure during [{}]", source),
                  e);
              GatewayRecoveryListener.this.onFailure("failed to update cluster state");
            }

            @Override
            public void clusterStateProcessed(
                String source, ClusterState oldState, ClusterState newState) {
              logger.info(
                  "recovered [{}] indices into cluster_state",
                  newState.metaData().indices().size());
            }
          });
    }

    @Override
    public void onFailure(String message) {
      recovered.set(false);
      scheduledRecovery.set(false);
      // don't remove the block here, we don't want to allow anything in such a case
      logger.info("metadata state not restored, reason: {}", message);
    }
  }

  // used for testing
  public TimeValue recoverAfterTime() {
    return recoverAfterTime;
  }
}
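A hedged sketch of node settings that drive the gating logic in clusterChanged above; the concrete values are illustrative:

Settings gatewaySettings =
    Settings.builder()
        .put("gateway.expected_nodes", 3) // recover immediately once 3 data+master nodes joined
        .put("gateway.recover_after_time", "10m") // otherwise wait at most 10 minutes
        .put("gateway.recover_after_nodes", 2) // but never recover with fewer than 2 nodes
        .build();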
Example #5
public final class HttpTransportSettings {

  public static final Setting<Boolean> SETTING_CORS_ENABLED =
      Setting.boolSetting("http.cors.enabled", false, false, Scope.CLUSTER);
  public static final Setting<String> SETTING_CORS_ALLOW_ORIGIN =
      new Setting<String>("http.cors.allow-origin", "", (value) -> value, false, Scope.CLUSTER);
  public static final Setting<Integer> SETTING_CORS_MAX_AGE =
      Setting.intSetting("http.cors.max-age", 1728000, false, Scope.CLUSTER);
  public static final Setting<String> SETTING_CORS_ALLOW_METHODS =
      new Setting<String>(
          "http.cors.allow-methods",
          "OPTIONS, HEAD, GET, POST, PUT, DELETE",
          (value) -> value,
          false,
          Scope.CLUSTER);
  public static final Setting<String> SETTING_CORS_ALLOW_HEADERS =
      new Setting<String>(
          "http.cors.allow-headers",
          "X-Requested-With, Content-Type, Content-Length",
          (value) -> value,
          false,
          Scope.CLUSTER);
  public static final Setting<Boolean> SETTING_CORS_ALLOW_CREDENTIALS =
      Setting.boolSetting("http.cors.allow-credentials", false, false, Scope.CLUSTER);
  public static final Setting<Boolean> SETTING_PIPELINING =
      Setting.boolSetting("http.pipelining", true, false, Scope.CLUSTER);
  public static final Setting<Integer> SETTING_PIPELINING_MAX_EVENTS =
      Setting.intSetting("http.pipelining.max_events", 10000, false, Scope.CLUSTER);
  public static final Setting<Boolean> SETTING_HTTP_COMPRESSION =
      Setting.boolSetting("http.compression", false, false, Scope.CLUSTER);
  public static final Setting<Integer> SETTING_HTTP_COMPRESSION_LEVEL =
      Setting.intSetting("http.compression_level", 6, false, Scope.CLUSTER);
  public static final Setting<List<String>> SETTING_HTTP_HOST =
      listSetting("http.host", emptyList(), s -> s, false, Scope.CLUSTER);
  public static final Setting<List<String>> SETTING_HTTP_PUBLISH_HOST =
      listSetting("http.publish_host", SETTING_HTTP_HOST, s -> s, false, Scope.CLUSTER);
  public static final Setting<List<String>> SETTING_HTTP_BIND_HOST =
      listSetting("http.bind_host", SETTING_HTTP_HOST, s -> s, false, Scope.CLUSTER);

  public static final Setting<PortsRange> SETTING_HTTP_PORT =
      new Setting<PortsRange>("http.port", "9200-9300", PortsRange::new, false, Scope.CLUSTER);
  public static final Setting<Integer> SETTING_HTTP_PUBLISH_PORT =
      Setting.intSetting("http.publish_port", 0, 0, false, Scope.CLUSTER);
  public static final Setting<Boolean> SETTING_HTTP_DETAILED_ERRORS_ENABLED =
      Setting.boolSetting("http.detailed_errors.enabled", true, false, Scope.CLUSTER);
  public static final Setting<ByteSizeValue> SETTING_HTTP_MAX_CONTENT_LENGTH =
      Setting.byteSizeSetting(
          "http.max_content_length", new ByteSizeValue(100, ByteSizeUnit.MB), false, Scope.CLUSTER);
  public static final Setting<ByteSizeValue> SETTING_HTTP_MAX_CHUNK_SIZE =
      Setting.byteSizeSetting(
          "http.max_chunk_size", new ByteSizeValue(8, ByteSizeUnit.KB), false, Scope.CLUSTER);
  public static final Setting<ByteSizeValue> SETTING_HTTP_MAX_HEADER_SIZE =
      Setting.byteSizeSetting(
          "http.max_header_size", new ByteSizeValue(8, ByteSizeUnit.KB), false, Scope.CLUSTER);
  public static final Setting<ByteSizeValue> SETTING_HTTP_MAX_INITIAL_LINE_LENGTH =
      Setting.byteSizeSetting(
          "http.max_initial_line_length",
          new ByteSizeValue(4, ByteSizeUnit.KB),
          false,
          Scope.CLUSTER);
  // don't reset cookies by default, since we don't really need to; note that cookie parsing
  // was fixed in Netty 3.5.1 regarding stack allocation, but we currently still don't need
  // cookies
  public static final Setting<Boolean> SETTING_HTTP_RESET_COOKIES =
      Setting.boolSetting("http.reset_cookies", false, false, Scope.CLUSTER);

  private HttpTransportSettings() {}
}
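A hedged sketch of reading these settings from a Settings instance:

Settings httpSettings = Settings.builder().put("http.port", "8080").build();
PortsRange port = HttpTransportSettings.SETTING_HTTP_PORT.get(httpSettings); // 8080
boolean corsEnabled = HttpTransportSettings.SETTING_CORS_ENABLED.get(httpSettings); // false (default)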
Example #6
/**
 * A base class for {@link org.elasticsearch.discovery.zen.fd.MasterFaultDetection} &amp; {@link
 * org.elasticsearch.discovery.zen.fd.NodesFaultDetection}, making sure both use the same setting.
 */
public abstract class FaultDetection extends AbstractComponent {

  public static final Setting<Boolean> CONNECT_ON_NETWORK_DISCONNECT_SETTING =
      Setting.boolSetting(
          "discovery.zen.fd.connect_on_network_disconnect", false, Property.NodeScope);
  public static final Setting<TimeValue> PING_INTERVAL_SETTING =
      Setting.positiveTimeSetting(
          "discovery.zen.fd.ping_interval", timeValueSeconds(1), Property.NodeScope);
  public static final Setting<TimeValue> PING_TIMEOUT_SETTING =
      Setting.timeSetting(
          "discovery.zen.fd.ping_timeout", timeValueSeconds(30), Property.NodeScope);
  public static final Setting<Integer> PING_RETRIES_SETTING =
      Setting.intSetting("discovery.zen.fd.ping_retries", 3, Property.NodeScope);
  public static final Setting<Boolean> REGISTER_CONNECTION_LISTENER_SETTING =
      Setting.boolSetting(
          "discovery.zen.fd.register_connection_listener", true, Property.NodeScope);

  protected final ThreadPool threadPool;
  protected final ClusterName clusterName;
  protected final TransportService transportService;

  // used mainly for testing, should always be true
  protected final boolean registerConnectionListener;
  protected final FDConnectionListener connectionListener;
  protected final boolean connectOnNetworkDisconnect;

  protected final TimeValue pingInterval;
  protected final TimeValue pingRetryTimeout;
  protected final int pingRetryCount;

  public FaultDetection(
      Settings settings,
      ThreadPool threadPool,
      TransportService transportService,
      ClusterName clusterName) {
    super(settings);
    this.threadPool = threadPool;
    this.transportService = transportService;
    this.clusterName = clusterName;

    this.connectOnNetworkDisconnect = CONNECT_ON_NETWORK_DISCONNECT_SETTING.get(settings);
    this.pingInterval = PING_INTERVAL_SETTING.get(settings);
    this.pingRetryTimeout = PING_TIMEOUT_SETTING.get(settings);
    this.pingRetryCount = PING_RETRIES_SETTING.get(settings);
    this.registerConnectionListener = REGISTER_CONNECTION_LISTENER_SETTING.get(settings);

    this.connectionListener = new FDConnectionListener();
    if (registerConnectionListener) {
      transportService.addConnectionListener(connectionListener);
    }
  }

  public void close() {
    transportService.removeConnectionListener(connectionListener);
  }

  /**
   * This method will be called when the {@link org.elasticsearch.transport.TransportService}
   * raises a node disconnected event.
   */
  abstract void handleTransportDisconnect(DiscoveryNode node);

  private class FDConnectionListener implements TransportConnectionListener {
    @Override
    public void onNodeConnected(DiscoveryNode node) {}

    @Override
    public void onNodeDisconnected(DiscoveryNode node) {
      handleTransportDisconnect(node);
    }
  }
}
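A hedged sketch of overriding the fault-detection timing declared above; the values are illustrative:

Settings fdSettings =
    Settings.builder()
        .put("discovery.zen.fd.ping_interval", "5s")
        .put("discovery.zen.fd.ping_retries", 5)
        .build();
// PING_INTERVAL_SETTING.get(fdSettings) -> 5s, PING_RETRIES_SETTING.get(fdSettings) -> 5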
Example #7
public class IndexMetaData
    implements Diffable<IndexMetaData>, FromXContentBuilder<IndexMetaData>, ToXContent {

  public interface Custom extends Diffable<Custom>, ToXContent {

    String type();

    Custom fromMap(Map<String, Object> map) throws IOException;

    Custom fromXContent(XContentParser parser) throws IOException;

    /**
     * Merges from this to another, with this being more important, i.e., if something exists in
     * this and another, this will prevail.
     */
    Custom mergeWith(Custom another);
  }

  public static Map<String, Custom> customPrototypes = new HashMap<>();

  /** Register a custom index meta data factory. Make sure to call it from a static block. */
  public static void registerPrototype(String type, Custom proto) {
    customPrototypes.put(type, proto);
  }

  @Nullable
  public static <T extends Custom> T lookupPrototype(String type) {
    //noinspection unchecked
    return (T) customPrototypes.get(type);
  }

  public static <T extends Custom> T lookupPrototypeSafe(String type) {
    //noinspection unchecked
    T proto = (T) customPrototypes.get(type);
    if (proto == null) {
      throw new IllegalArgumentException(
          "No custom metadata prototype registered for type [" + type + "]");
    }
    return proto;
  }

  public static final ClusterBlock INDEX_READ_ONLY_BLOCK =
      new ClusterBlock(
          5,
          "index read-only (api)",
          false,
          false,
          RestStatus.FORBIDDEN,
          EnumSet.of(ClusterBlockLevel.WRITE, ClusterBlockLevel.METADATA_WRITE));
  public static final ClusterBlock INDEX_READ_BLOCK =
      new ClusterBlock(
          7,
          "index read (api)",
          false,
          false,
          RestStatus.FORBIDDEN,
          EnumSet.of(ClusterBlockLevel.READ));
  public static final ClusterBlock INDEX_WRITE_BLOCK =
      new ClusterBlock(
          8,
          "index write (api)",
          false,
          false,
          RestStatus.FORBIDDEN,
          EnumSet.of(ClusterBlockLevel.WRITE));
  public static final ClusterBlock INDEX_METADATA_BLOCK =
      new ClusterBlock(
          9,
          "index metadata (api)",
          false,
          false,
          RestStatus.FORBIDDEN,
          EnumSet.of(ClusterBlockLevel.METADATA_WRITE, ClusterBlockLevel.METADATA_READ));

  public enum State {
    OPEN((byte) 0),
    CLOSE((byte) 1);

    private final byte id;

    State(byte id) {
      this.id = id;
    }

    public byte id() {
      return this.id;
    }

    public static State fromId(byte id) {
      if (id == 0) {
        return OPEN;
      } else if (id == 1) {
        return CLOSE;
      }
      throw new IllegalStateException("No state match for id [" + id + "]");
    }

    public static State fromString(String state) {
      if ("open".equals(state)) {
        return OPEN;
      } else if ("close".equals(state)) {
        return CLOSE;
      }
      throw new IllegalStateException("No state match for [" + state + "]");
    }
  }
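  // For illustration: State.fromId((byte) 1) returns CLOSE and State.fromString("open")
  // returns OPEN; any other id or string throws IllegalStateException.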

  public static final String INDEX_SETTING_PREFIX = "index.";
  public static final String SETTING_NUMBER_OF_SHARDS = "index.number_of_shards";
  public static final Setting<Integer> INDEX_NUMBER_OF_SHARDS_SETTING =
      Setting.intSetting(SETTING_NUMBER_OF_SHARDS, 5, 1, false, Setting.Scope.INDEX);
  public static final String SETTING_NUMBER_OF_REPLICAS = "index.number_of_replicas";
  public static final Setting<Integer> INDEX_NUMBER_OF_REPLICAS_SETTING =
      Setting.intSetting(SETTING_NUMBER_OF_REPLICAS, 1, 0, true, Setting.Scope.INDEX);
  public static final String SETTING_SHADOW_REPLICAS = "index.shadow_replicas";
  public static final Setting<Boolean> INDEX_SHADOW_REPLICAS_SETTING =
      Setting.boolSetting(SETTING_SHADOW_REPLICAS, false, false, Setting.Scope.INDEX);

  public static final String SETTING_SHARED_FILESYSTEM = "index.shared_filesystem";
  public static final Setting<Boolean> INDEX_SHARED_FILESYSTEM_SETTING =
      Setting.boolSetting(SETTING_SHARED_FILESYSTEM, false, false, Setting.Scope.INDEX);

  public static final String SETTING_AUTO_EXPAND_REPLICAS = "index.auto_expand_replicas";
  public static final Setting<AutoExpandReplicas> INDEX_AUTO_EXPAND_REPLICAS_SETTING =
      AutoExpandReplicas.SETTING;
  public static final String SETTING_READ_ONLY = "index.blocks.read_only";
  public static final Setting<Boolean> INDEX_READ_ONLY_SETTING =
      Setting.boolSetting(SETTING_READ_ONLY, false, true, Setting.Scope.INDEX);

  public static final String SETTING_BLOCKS_READ = "index.blocks.read";
  public static final Setting<Boolean> INDEX_BLOCKS_READ_SETTING =
      Setting.boolSetting(SETTING_BLOCKS_READ, false, true, Setting.Scope.INDEX);

  public static final String SETTING_BLOCKS_WRITE = "index.blocks.write";
  public static final Setting<Boolean> INDEX_BLOCKS_WRITE_SETTING =
      Setting.boolSetting(SETTING_BLOCKS_WRITE, false, true, Setting.Scope.INDEX);

  public static final String SETTING_BLOCKS_METADATA = "index.blocks.metadata";
  public static final Setting<Boolean> INDEX_BLOCKS_METADATA_SETTING =
      Setting.boolSetting(SETTING_BLOCKS_METADATA, false, true, Setting.Scope.INDEX);

  public static final String SETTING_VERSION_CREATED = "index.version.created";
  public static final String SETTING_VERSION_CREATED_STRING = "index.version.created_string";
  public static final String SETTING_VERSION_UPGRADED = "index.version.upgraded";
  public static final String SETTING_VERSION_UPGRADED_STRING = "index.version.upgraded_string";
  public static final String SETTING_VERSION_MINIMUM_COMPATIBLE =
      "index.version.minimum_compatible";
  public static final String SETTING_CREATION_DATE = "index.creation_date";
  public static final String SETTING_PRIORITY = "index.priority";
  public static final Setting<Integer> INDEX_PRIORITY_SETTING =
      Setting.intSetting("index.priority", 1, 0, true, Setting.Scope.INDEX);
  public static final String SETTING_CREATION_DATE_STRING = "index.creation_date_string";
  public static final String SETTING_INDEX_UUID = "index.uuid";
  public static final String SETTING_DATA_PATH = "index.data_path";
  public static final Setting<String> INDEX_DATA_PATH_SETTING =
      new Setting<>(SETTING_DATA_PATH, "", Function.identity(), false, Setting.Scope.INDEX);
  public static final String SETTING_SHARED_FS_ALLOW_RECOVERY_ON_ANY_NODE =
      "index.shared_filesystem.recover_on_any_node";
  public static final Setting<Boolean> INDEX_SHARED_FS_ALLOW_RECOVERY_ON_ANY_NODE_SETTING =
      Setting.boolSetting(
          SETTING_SHARED_FS_ALLOW_RECOVERY_ON_ANY_NODE, false, true, Setting.Scope.INDEX);
  public static final String INDEX_UUID_NA_VALUE = "_na_";

  public static final Setting<Settings> INDEX_ROUTING_REQUIRE_GROUP_SETTING =
      Setting.groupSetting("index.routing.allocation.require.", true, Setting.Scope.INDEX);
  public static final Setting<Settings> INDEX_ROUTING_INCLUDE_GROUP_SETTING =
      Setting.groupSetting("index.routing.allocation.include.", true, Setting.Scope.INDEX);
  public static final Setting<Settings> INDEX_ROUTING_EXCLUDE_GROUP_SETTING =
      Setting.groupSetting("index.routing.allocation.exclude.", true, Setting.Scope.INDEX);

  public static final IndexMetaData PROTO =
      IndexMetaData.builder("")
          .settings(Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT))
          .numberOfShards(1)
          .numberOfReplicas(0)
          .build();

  public static final String KEY_ACTIVE_ALLOCATIONS = "active_allocations";

  private final int numberOfShards;
  private final int numberOfReplicas;

  private final Index index;
  private final long version;

  private final State state;

  private final ImmutableOpenMap<String, AliasMetaData> aliases;

  private final Settings settings;

  private final ImmutableOpenMap<String, MappingMetaData> mappings;

  private final ImmutableOpenMap<String, Custom> customs;

  private final ImmutableOpenIntMap<Set<String>> activeAllocationIds;

  private final transient int totalNumberOfShards;

  private final DiscoveryNodeFilters requireFilters;
  private final DiscoveryNodeFilters includeFilters;
  private final DiscoveryNodeFilters excludeFilters;

  private final Version indexCreatedVersion;
  private final Version indexUpgradedVersion;
  private final org.apache.lucene.util.Version minimumCompatibleLuceneVersion;

  private IndexMetaData(
      Index index,
      long version,
      State state,
      int numberOfShards,
      int numberOfReplicas,
      Settings settings,
      ImmutableOpenMap<String, MappingMetaData> mappings,
      ImmutableOpenMap<String, AliasMetaData> aliases,
      ImmutableOpenMap<String, Custom> customs,
      ImmutableOpenIntMap<Set<String>> activeAllocationIds,
      DiscoveryNodeFilters requireFilters,
      DiscoveryNodeFilters includeFilters,
      DiscoveryNodeFilters excludeFilters,
      Version indexCreatedVersion,
      Version indexUpgradedVersion,
      org.apache.lucene.util.Version minimumCompatibleLuceneVersion) {

    this.index = index;
    this.version = version;
    this.state = state;
    this.numberOfShards = numberOfShards;
    this.numberOfReplicas = numberOfReplicas;
    this.totalNumberOfShards = numberOfShards * (numberOfReplicas + 1);
    this.settings = settings;
    this.mappings = mappings;
    this.customs = customs;
    this.aliases = aliases;
    this.activeAllocationIds = activeAllocationIds;
    this.requireFilters = requireFilters;
    this.includeFilters = includeFilters;
    this.excludeFilters = excludeFilters;
    this.indexCreatedVersion = indexCreatedVersion;
    this.indexUpgradedVersion = indexUpgradedVersion;
    this.minimumCompatibleLuceneVersion = minimumCompatibleLuceneVersion;
  }

  public Index getIndex() {
    return index;
  }

  public String getIndexUUID() {
    return index.getUUID();
  }

  /**
   * Test whether the current index UUID is the same as the given one. Returns true if either is
   * {@code _na_}.
   */
  public boolean isSameUUID(String otherUUID) {
    assert otherUUID != null;
    assert getIndexUUID() != null;
    if (INDEX_UUID_NA_VALUE.equals(otherUUID) || INDEX_UUID_NA_VALUE.equals(getIndexUUID())) {
      return true;
    }
    return otherUUID.equals(getIndexUUID());
  }
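  // For illustration: when either UUID equals INDEX_UUID_NA_VALUE ("_na_"), the check above
  // treats the UUIDs as matching, so isSameUUID(...) returns true regardless of the argument.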

  public long getVersion() {
    return this.version;
  }

  /**
   * Return the {@link Version} on which this index has been created. This information is typically
   * useful for backward compatibility.
   */
  public Version getCreationVersion() {
    return indexCreatedVersion;
  }

  /**
   * Return the {@link Version} on which this index has been upgraded. This information is typically
   * useful for backward compatibility.
   */
  public Version getUpgradedVersion() {
    return indexUpgradedVersion;
  }

  /** Return the {@link org.apache.lucene.util.Version} of the oldest Lucene segment in the index. */
  public org.apache.lucene.util.Version getMinimumCompatibleVersion() {
    return minimumCompatibleLuceneVersion;
  }

  public long getCreationDate() {
    return settings.getAsLong(SETTING_CREATION_DATE, -1L);
  }

  public State getState() {
    return this.state;
  }

  public int getNumberOfShards() {
    return numberOfShards;
  }

  public int getNumberOfReplicas() {
    return numberOfReplicas;
  }

  public int getTotalNumberOfShards() {
    return totalNumberOfShards;
  }

  public Settings getSettings() {
    return settings;
  }

  public ImmutableOpenMap<String, AliasMetaData> getAliases() {
    return this.aliases;
  }

  public ImmutableOpenMap<String, MappingMetaData> getMappings() {
    return mappings;
  }

  @Nullable
  public MappingMetaData mapping(String mappingType) {
    return mappings.get(mappingType);
  }

  /**
   * Sometimes the default mapping exists while an actual mapping has not been introduced yet; in
   * that case, we want to return the default mapping, since it may carry some default mapping
   * definitions.
   *
   * <p>Note: once the mapping type is introduced, the default mapping is applied to the actual
   * typed MappingMetaData, setting its routing, timestamp, and so on if needed.
   */
  @Nullable
  public MappingMetaData mappingOrDefault(String mappingType) {
    MappingMetaData mapping = mappings.get(mappingType);
    if (mapping != null) {
      return mapping;
    }
    return mappings.get(MapperService.DEFAULT_MAPPING);
  }

  public ImmutableOpenMap<String, Custom> getCustoms() {
    return this.customs;
  }

  @SuppressWarnings("unchecked")
  public <T extends Custom> T custom(String type) {
    return (T) customs.get(type);
  }

  public ImmutableOpenIntMap<Set<String>> getActiveAllocationIds() {
    return activeAllocationIds;
  }

  public Set<String> activeAllocationIds(int shardId) {
    assert shardId >= 0 && shardId < numberOfShards;
    return activeAllocationIds.get(shardId);
  }

  @Nullable
  public DiscoveryNodeFilters requireFilters() {
    return requireFilters;
  }

  @Nullable
  public DiscoveryNodeFilters includeFilters() {
    return includeFilters;
  }

  @Nullable
  public DiscoveryNodeFilters excludeFilters() {
    return excludeFilters;
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }

    IndexMetaData that = (IndexMetaData) o;

    if (!aliases.equals(that.aliases)) {
      return false;
    }
    if (!index.equals(that.index)) {
      return false;
    }
    if (!mappings.equals(that.mappings)) {
      return false;
    }
    if (!settings.equals(that.settings)) {
      return false;
    }
    if (state != that.state) {
      return false;
    }
    if (!customs.equals(that.customs)) {
      return false;
    }
    if (!activeAllocationIds.equals(that.activeAllocationIds)) {
      return false;
    }
    return true;
  }

  @Override
  public int hashCode() {
    int result = index.hashCode();
    result = 31 * result + state.hashCode();
    result = 31 * result + aliases.hashCode();
    result = 31 * result + settings.hashCode();
    result = 31 * result + mappings.hashCode();
    result = 31 * result + activeAllocationIds.hashCode();
    return result;
  }

  @Override
  public Diff<IndexMetaData> diff(IndexMetaData previousState) {
    return new IndexMetaDataDiff(previousState, this);
  }

  @Override
  public Diff<IndexMetaData> readDiffFrom(StreamInput in) throws IOException {
    return new IndexMetaDataDiff(in);
  }

  @Override
  public IndexMetaData fromXContent(XContentParser parser, ParseFieldMatcher parseFieldMatcher)
      throws IOException {
    return Builder.fromXContent(parser);
  }

  @Override
  public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    Builder.toXContent(this, builder, params);
    return builder;
  }

  private static class IndexMetaDataDiff implements Diff<IndexMetaData> {

    private final String index;
    private final long version;
    private final State state;
    private final Settings settings;
    private final Diff<ImmutableOpenMap<String, MappingMetaData>> mappings;
    private final Diff<ImmutableOpenMap<String, AliasMetaData>> aliases;
    private final Diff<ImmutableOpenMap<String, Custom>> customs;
    private final Diff<ImmutableOpenIntMap<Set<String>>> activeAllocationIds;

    public IndexMetaDataDiff(IndexMetaData before, IndexMetaData after) {
      index = after.index.getName();
      version = after.version;
      state = after.state;
      settings = after.settings;
      mappings =
          DiffableUtils.diff(
              before.mappings, after.mappings, DiffableUtils.getStringKeySerializer());
      aliases =
          DiffableUtils.diff(before.aliases, after.aliases, DiffableUtils.getStringKeySerializer());
      customs =
          DiffableUtils.diff(before.customs, after.customs, DiffableUtils.getStringKeySerializer());
      activeAllocationIds =
          DiffableUtils.diff(
              before.activeAllocationIds,
              after.activeAllocationIds,
              DiffableUtils.getVIntKeySerializer(),
              DiffableUtils.StringSetValueSerializer.getInstance());
    }

    public IndexMetaDataDiff(StreamInput in) throws IOException {
      index = in.readString();
      version = in.readLong();
      state = State.fromId(in.readByte());
      settings = Settings.readSettingsFromStream(in);
      mappings =
          DiffableUtils.readImmutableOpenMapDiff(
              in, DiffableUtils.getStringKeySerializer(), MappingMetaData.PROTO);
      aliases =
          DiffableUtils.readImmutableOpenMapDiff(
              in, DiffableUtils.getStringKeySerializer(), AliasMetaData.PROTO);
      customs =
          DiffableUtils.readImmutableOpenMapDiff(
              in,
              DiffableUtils.getStringKeySerializer(),
              new DiffableUtils.DiffableValueSerializer<String, Custom>() {
                @Override
                public Custom read(StreamInput in, String key) throws IOException {
                  return lookupPrototypeSafe(key).readFrom(in);
                }

                @Override
                public Diff<Custom> readDiff(StreamInput in, String key) throws IOException {
                  return lookupPrototypeSafe(key).readDiffFrom(in);
                }
              });
      activeAllocationIds =
          DiffableUtils.readImmutableOpenIntMapDiff(
              in,
              DiffableUtils.getVIntKeySerializer(),
              DiffableUtils.StringSetValueSerializer.getInstance());
    }

    @Override
    public void writeTo(StreamOutput out) throws IOException {
      out.writeString(index);
      out.writeLong(version);
      out.writeByte(state.id);
      Settings.writeSettingsToStream(settings, out);
      mappings.writeTo(out);
      aliases.writeTo(out);
      customs.writeTo(out);
      activeAllocationIds.writeTo(out);
    }

    @Override
    public IndexMetaData apply(IndexMetaData part) {
      Builder builder = builder(index);
      builder.version(version);
      builder.state(state);
      builder.settings(settings);
      builder.mappings.putAll(mappings.apply(part.mappings));
      builder.aliases.putAll(aliases.apply(part.aliases));
      builder.customs.putAll(customs.apply(part.customs));
      builder.activeAllocationIds.putAll(activeAllocationIds.apply(part.activeAllocationIds));
      return builder.build();
    }
  }

  @Override
  public IndexMetaData readFrom(StreamInput in) throws IOException {
    Builder builder = new Builder(in.readString());
    builder.version(in.readLong());
    builder.state(State.fromId(in.readByte()));
    builder.settings(readSettingsFromStream(in));
    int mappingsSize = in.readVInt();
    for (int i = 0; i < mappingsSize; i++) {
      MappingMetaData mappingMd = MappingMetaData.PROTO.readFrom(in);
      builder.putMapping(mappingMd);
    }
    int aliasesSize = in.readVInt();
    for (int i = 0; i < aliasesSize; i++) {
      AliasMetaData aliasMd = AliasMetaData.Builder.readFrom(in);
      builder.putAlias(aliasMd);
    }
    int customSize = in.readVInt();
    for (int i = 0; i < customSize; i++) {
      String type = in.readString();
      Custom customIndexMetaData = lookupPrototypeSafe(type).readFrom(in);
      builder.putCustom(type, customIndexMetaData);
    }
    int activeAllocationIdsSize = in.readVInt();
    for (int i = 0; i < activeAllocationIdsSize; i++) {
      int key = in.readVInt();
      Set<String> allocationIds =
          DiffableUtils.StringSetValueSerializer.getInstance().read(in, key);
      builder.putActiveAllocationIds(key, allocationIds);
    }
    return builder.build();
  }

  @Override
  public void writeTo(StreamOutput out) throws IOException {
    out.writeString(index.getName()); // uuid will come as part of settings
    out.writeLong(version);
    out.writeByte(state.id());
    writeSettingsToStream(settings, out);
    out.writeVInt(mappings.size());
    for (ObjectCursor<MappingMetaData> cursor : mappings.values()) {
      cursor.value.writeTo(out);
    }
    out.writeVInt(aliases.size());
    for (ObjectCursor<AliasMetaData> cursor : aliases.values()) {
      cursor.value.writeTo(out);
    }
    out.writeVInt(customs.size());
    for (ObjectObjectCursor<String, Custom> cursor : customs) {
      out.writeString(cursor.key);
      cursor.value.writeTo(out);
    }
    out.writeVInt(activeAllocationIds.size());
    for (IntObjectCursor<Set<String>> cursor : activeAllocationIds) {
      out.writeVInt(cursor.key);
      DiffableUtils.StringSetValueSerializer.getInstance().write(cursor.value, out);
    }
  }

  public static Builder builder(String index) {
    return new Builder(index);
  }

  public static Builder builder(IndexMetaData indexMetaData) {
    return new Builder(indexMetaData);
  }

  public static class Builder {

    private String index;
    private State state = State.OPEN;
    private long version = 1;
    private Settings settings = Settings.Builder.EMPTY_SETTINGS;
    private final ImmutableOpenMap.Builder<String, MappingMetaData> mappings;
    private final ImmutableOpenMap.Builder<String, AliasMetaData> aliases;
    private final ImmutableOpenMap.Builder<String, Custom> customs;
    private final ImmutableOpenIntMap.Builder<Set<String>> activeAllocationIds;

    public Builder(String index) {
      this.index = index;
      this.mappings = ImmutableOpenMap.builder();
      this.aliases = ImmutableOpenMap.builder();
      this.customs = ImmutableOpenMap.builder();
      this.activeAllocationIds = ImmutableOpenIntMap.builder();
    }

    public Builder(IndexMetaData indexMetaData) {
      this.index = indexMetaData.getIndex().getName();
      this.state = indexMetaData.state;
      this.version = indexMetaData.version;
      this.settings = indexMetaData.getSettings();
      this.mappings = ImmutableOpenMap.builder(indexMetaData.mappings);
      this.aliases = ImmutableOpenMap.builder(indexMetaData.aliases);
      this.customs = ImmutableOpenMap.builder(indexMetaData.customs);
      this.activeAllocationIds = ImmutableOpenIntMap.builder(indexMetaData.activeAllocationIds);
    }

    public String index() {
      return index;
    }

    public Builder index(String index) {
      this.index = index;
      return this;
    }

    public Builder numberOfShards(int numberOfShards) {
      settings =
          settingsBuilder().put(settings).put(SETTING_NUMBER_OF_SHARDS, numberOfShards).build();
      return this;
    }

    public int numberOfShards() {
      return settings.getAsInt(SETTING_NUMBER_OF_SHARDS, -1);
    }

    public Builder numberOfReplicas(int numberOfReplicas) {
      settings =
          settingsBuilder().put(settings).put(SETTING_NUMBER_OF_REPLICAS, numberOfReplicas).build();
      return this;
    }

    public int numberOfReplicas() {
      return settings.getAsInt(SETTING_NUMBER_OF_REPLICAS, -1);
    }

    public Builder creationDate(long creationDate) {
      settings = settingsBuilder().put(settings).put(SETTING_CREATION_DATE, creationDate).build();
      return this;
    }

    public Builder settings(Settings.Builder settings) {
      this.settings = settings.build();
      return this;
    }

    public Builder settings(Settings settings) {
      this.settings = settings;
      return this;
    }

    public MappingMetaData mapping(String type) {
      return mappings.get(type);
    }

    public Builder putMapping(String type, String source) throws IOException {
      try (XContentParser parser = XContentFactory.xContent(source).createParser(source)) {
        putMapping(new MappingMetaData(type, parser.mapOrdered()));
      }
      return this;
    }

    public Builder putMapping(MappingMetaData mappingMd) {
      mappings.put(mappingMd.type(), mappingMd);
      return this;
    }

    public Builder state(State state) {
      this.state = state;
      return this;
    }

    public Builder putAlias(AliasMetaData aliasMetaData) {
      aliases.put(aliasMetaData.alias(), aliasMetaData);
      return this;
    }

    public Builder putAlias(AliasMetaData.Builder aliasMetaData) {
      aliases.put(aliasMetaData.alias(), aliasMetaData.build());
      return this;
    }

    public Builder removeAlias(String alias) {
      aliases.remove(alias);
      return this;
    }

    public Builder removeAllAliases() {
      aliases.clear();
      return this;
    }

    public Builder putCustom(String type, Custom customIndexMetaData) {
      this.customs.put(type, customIndexMetaData);
      return this;
    }

    public Builder putActiveAllocationIds(int shardId, Set<String> allocationIds) {
      activeAllocationIds.put(shardId, new HashSet<>(allocationIds));
      return this;
    }

    public long version() {
      return this.version;
    }

    public Builder version(long version) {
      this.version = version;
      return this;
    }

    public IndexMetaData build() {
      ImmutableOpenMap.Builder<String, AliasMetaData> tmpAliases = aliases;
      Settings tmpSettings = settings;

      // update default mapping on the MappingMetaData
      if (mappings.containsKey(MapperService.DEFAULT_MAPPING)) {
        MappingMetaData defaultMapping = mappings.get(MapperService.DEFAULT_MAPPING);
        for (ObjectCursor<MappingMetaData> cursor : mappings.values()) {
          cursor.value.updateDefaultMapping(defaultMapping);
        }
      }

      Integer maybeNumberOfShards = settings.getAsInt(SETTING_NUMBER_OF_SHARDS, null);
      if (maybeNumberOfShards == null) {
        throw new IllegalArgumentException("must specify numberOfShards for index [" + index + "]");
      }
      int numberOfShards = maybeNumberOfShards;
      if (numberOfShards <= 0) {
        throw new IllegalArgumentException(
            "must specify positive number of shards for index [" + index + "]");
      }

      Integer maybeNumberOfReplicas = settings.getAsInt(SETTING_NUMBER_OF_REPLICAS, null);
      if (maybeNumberOfReplicas == null) {
        throw new IllegalArgumentException(
            "must specify numberOfReplicas for index [" + index + "]");
      }
      int numberOfReplicas = maybeNumberOfReplicas;
      if (numberOfReplicas < 0) {
        throw new IllegalArgumentException(
            "must specify non-negative number of shards for index [" + index + "]");
      }

      // fill missing slots in activeAllocationIds with empty set if needed and make all entries
      // immutable
      ImmutableOpenIntMap.Builder<Set<String>> filledActiveAllocationIds =
          ImmutableOpenIntMap.builder();
      for (int i = 0; i < numberOfShards; i++) {
        if (activeAllocationIds.containsKey(i)) {
          filledActiveAllocationIds.put(
              i, Collections.unmodifiableSet(new HashSet<>(activeAllocationIds.get(i))));
        } else {
          filledActiveAllocationIds.put(i, Collections.emptySet());
        }
      }
      final Map<String, String> requireMap =
          INDEX_ROUTING_REQUIRE_GROUP_SETTING.get(settings).getAsMap();
      final DiscoveryNodeFilters requireFilters;
      if (requireMap.isEmpty()) {
        requireFilters = null;
      } else {
        requireFilters = DiscoveryNodeFilters.buildFromKeyValue(AND, requireMap);
      }
      Map<String, String> includeMap = INDEX_ROUTING_INCLUDE_GROUP_SETTING.get(settings).getAsMap();
      final DiscoveryNodeFilters includeFilters;
      if (includeMap.isEmpty()) {
        includeFilters = null;
      } else {
        includeFilters = DiscoveryNodeFilters.buildFromKeyValue(OR, includeMap);
      }
      Map<String, String> excludeMap = INDEX_ROUTING_EXCLUDE_GROUP_SETTING.get(settings).getAsMap();
      final DiscoveryNodeFilters excludeFilters;
      if (excludeMap.isEmpty()) {
        excludeFilters = null;
      } else {
        excludeFilters = DiscoveryNodeFilters.buildFromKeyValue(OR, excludeMap);
      }
      Version indexCreatedVersion = Version.indexCreated(settings);
      Version indexUpgradedVersion =
          settings.getAsVersion(IndexMetaData.SETTING_VERSION_UPGRADED, indexCreatedVersion);
      String stringLuceneVersion = settings.get(SETTING_VERSION_MINIMUM_COMPATIBLE);
      final org.apache.lucene.util.Version minimumCompatibleLuceneVersion;
      if (stringLuceneVersion != null) {
        try {
          minimumCompatibleLuceneVersion =
              org.apache.lucene.util.Version.parse(stringLuceneVersion);
        } catch (ParseException ex) {
          throw new IllegalStateException(
              "Cannot parse lucene version ["
                  + stringLuceneVersion
                  + "] in the ["
                  + SETTING_VERSION_MINIMUM_COMPATIBLE
                  + "] setting",
              ex);
        }
      } else {
        minimumCompatibleLuceneVersion = null;
      }

      final String uuid = settings.get(SETTING_INDEX_UUID, INDEX_UUID_NA_VALUE);
      return new IndexMetaData(
          new Index(index, uuid),
          version,
          state,
          numberOfShards,
          numberOfReplicas,
          tmpSettings,
          mappings.build(),
          tmpAliases.build(),
          customs.build(),
          filledActiveAllocationIds.build(),
          requireFilters,
          includeFilters,
          excludeFilters,
          indexCreatedVersion,
          indexUpgradedVersion,
          minimumCompatibleLuceneVersion);
    }

    public static void toXContent(
        IndexMetaData indexMetaData, XContentBuilder builder, ToXContent.Params params)
        throws IOException {
      builder.startObject(
          indexMetaData.getIndex().getName(), XContentBuilder.FieldCaseConversion.NONE);

      builder.field("version", indexMetaData.getVersion());
      builder.field("state", indexMetaData.getState().toString().toLowerCase(Locale.ENGLISH));

      boolean binary = params.paramAsBoolean("binary", false);

      builder.startObject("settings");
      for (Map.Entry<String, String> entry : indexMetaData.getSettings().getAsMap().entrySet()) {
        builder.field(entry.getKey(), entry.getValue());
      }
      builder.endObject();

      builder.startArray("mappings");
      for (ObjectObjectCursor<String, MappingMetaData> cursor : indexMetaData.getMappings()) {
        if (binary) {
          builder.value(cursor.value.source().compressed());
        } else {
          byte[] data = cursor.value.source().uncompressed();
          try (XContentParser parser = XContentFactory.xContent(data).createParser(data)) {
            Map<String, Object> mapping = parser.mapOrdered();
            builder.map(mapping);
          }
        }
      }
      builder.endArray();

      for (ObjectObjectCursor<String, Custom> cursor : indexMetaData.getCustoms()) {
        builder.startObject(cursor.key, XContentBuilder.FieldCaseConversion.NONE);
        cursor.value.toXContent(builder, params);
        builder.endObject();
      }

      builder.startObject("aliases");
      for (ObjectCursor<AliasMetaData> cursor : indexMetaData.getAliases().values()) {
        AliasMetaData.Builder.toXContent(cursor.value, builder, params);
      }
      builder.endObject();

      builder.startObject(KEY_ACTIVE_ALLOCATIONS);
      for (IntObjectCursor<Set<String>> cursor : indexMetaData.activeAllocationIds) {
        builder.startArray(String.valueOf(cursor.key));
        for (String allocationId : cursor.value) {
          builder.value(allocationId);
        }
        builder.endArray();
      }
      builder.endObject();

      builder.endObject();
    }

    public static IndexMetaData fromXContent(XContentParser parser) throws IOException {
      if (parser.currentToken() == null) { // fresh parser? move to the first token
        parser.nextToken();
      }
      if (parser.currentToken()
          == XContentParser.Token.START_OBJECT) { // on a start object move to next token
        parser.nextToken();
      }
      if (parser.currentToken() != XContentParser.Token.FIELD_NAME) {
        throw new IllegalArgumentException(
            "expected field name but got a " + parser.currentToken());
      }
      Builder builder = new Builder(parser.currentName());

      String currentFieldName = null;
      XContentParser.Token token = parser.nextToken();
      if (token != XContentParser.Token.START_OBJECT) {
        throw new IllegalArgumentException("expected object but got a " + token);
      }
      while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
        if (token == XContentParser.Token.FIELD_NAME) {
          currentFieldName = parser.currentName();
        } else if (token == XContentParser.Token.START_OBJECT) {
          if ("settings".equals(currentFieldName)) {
            builder.settings(
                Settings.settingsBuilder()
                    .put(SettingsLoader.Helper.loadNestedFromMap(parser.mapOrdered())));
          } else if ("mappings".equals(currentFieldName)) {
            while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
              if (token == XContentParser.Token.FIELD_NAME) {
                currentFieldName = parser.currentName();
              } else if (token == XContentParser.Token.START_OBJECT) {
                String mappingType = currentFieldName;
                Map<String, Object> mappingSource =
                    MapBuilder.<String, Object>newMapBuilder()
                        .put(mappingType, parser.mapOrdered())
                        .map();
                builder.putMapping(new MappingMetaData(mappingType, mappingSource));
              } else {
                throw new IllegalArgumentException("Unexpected token: " + token);
              }
            }
          } else if ("aliases".equals(currentFieldName)) {
            while (parser.nextToken() != XContentParser.Token.END_OBJECT) {
              builder.putAlias(AliasMetaData.Builder.fromXContent(parser));
            }
          } else if (KEY_ACTIVE_ALLOCATIONS.equals(currentFieldName)) {
            while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
              if (token == XContentParser.Token.FIELD_NAME) {
                currentFieldName = parser.currentName();
              } else if (token == XContentParser.Token.START_ARRAY) {
                String shardId = currentFieldName;
                Set<String> allocationIds = new HashSet<>();
                while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
                  if (token == XContentParser.Token.VALUE_STRING) {
                    allocationIds.add(parser.text());
                  }
                }
                builder.putActiveAllocationIds(Integer.valueOf(shardId), allocationIds);
              } else {
                throw new IllegalArgumentException("Unexpected token: " + token);
              }
            }
          } else if ("warmers".equals(currentFieldName)) {
            // TODO: in 4.0, throw instead:
            // throw new IllegalArgumentException(
            //     "Warmers are not supported anymore - are you upgrading from 1.x?");
            // For now ignore them: warmers have been removed in 3.0 and are
            // simply ignored when upgrading from 2.x
            assert Version.CURRENT.major <= 3;
            parser.skipChildren();
          } else {
            // check if it's a custom index metadata
            Custom proto = lookupPrototype(currentFieldName);
            if (proto == null) {
              // TODO warn
              parser.skipChildren();
            } else {
              Custom custom = proto.fromXContent(parser);
              builder.putCustom(custom.type(), custom);
            }
          }
        } else if (token == XContentParser.Token.START_ARRAY) {
          if ("mappings".equals(currentFieldName)) {
            while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
              if (token == XContentParser.Token.VALUE_EMBEDDED_OBJECT) {
                builder.putMapping(
                    new MappingMetaData(new CompressedXContent(parser.binaryValue())));
              } else {
                Map<String, Object> mapping = parser.mapOrdered();
                if (mapping.size() == 1) {
                  String mappingType = mapping.keySet().iterator().next();
                  builder.putMapping(new MappingMetaData(mappingType, mapping));
                }
              }
            }
          } else {
            throw new IllegalArgumentException("Unexpected field for an array " + currentFieldName);
          }
        } else if (token.isValue()) {
          if ("state".equals(currentFieldName)) {
            builder.state(State.fromString(parser.text()));
          } else if ("version".equals(currentFieldName)) {
            builder.version(parser.longValue());
          } else {
            throw new IllegalArgumentException("Unexpected field [" + currentFieldName + "]");
          }
        } else {
          throw new IllegalArgumentException("Unexpected token " + token);
        }
      }
      return builder.build();
    }

    public static IndexMetaData readFrom(StreamInput in) throws IOException {
      return PROTO.readFrom(in);
    }
  }

  /**
   * Returns <code>true</code> iff the given settings indicate that the index associated with these
   * settings allocates its shards on a shared filesystem. Otherwise <code>false</code>. The
   * default value is the result of {@link
   * #isIndexUsingShadowReplicas(org.elasticsearch.common.settings.Settings)}.
   */
  public static boolean isOnSharedFilesystem(Settings settings) {
    return settings.getAsBoolean(SETTING_SHARED_FILESYSTEM, isIndexUsingShadowReplicas(settings));
  }

  /**
   * Returns <code>true</code> iff the given settings indicate that the index associated with these
   * settings uses shadow replicas. Otherwise <code>false</code>. The default value is
   * <code>false</code>.
   */
  public static boolean isIndexUsingShadowReplicas(Settings settings) {
    return settings.getAsBoolean(SETTING_SHADOW_REPLICAS, false);
  }

  /**
   * Adds human readable version and creation date settings. This method is used to display the
   * settings in a human readable format in the REST API.
   */
  public static Settings addHumanReadableSettings(Settings settings) {
    Settings.Builder builder = Settings.builder().put(settings);
    Version version = settings.getAsVersion(SETTING_VERSION_CREATED, null);
    if (version != null) {
      builder.put(SETTING_VERSION_CREATED_STRING, version.toString());
    }
    Version versionUpgraded = settings.getAsVersion(SETTING_VERSION_UPGRADED, null);
    if (versionUpgraded != null) {
      builder.put(SETTING_VERSION_UPGRADED_STRING, versionUpgraded.toString());
    }
    Long creationDate = settings.getAsLong(SETTING_CREATION_DATE, null);
    if (creationDate != null) {
      DateTime creationDateTime = new DateTime(creationDate, DateTimeZone.UTC);
      builder.put(SETTING_CREATION_DATE_STRING, creationDateTime.toString());
    }
    return builder.build();
  }
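
  // Worked example (added for illustration; the stored value is hypothetical):
  // for a creation date of 0L, the method above adds
  //
  //   index.creation_date_string = "1970-01-01T00:00:00.000Z"
  //
  // i.e. the Joda-Time ISO-8601 rendering in UTC. The two version settings are
  // echoed the same way via Version.toString().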
}
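/*
 * Illustration, not part of the original source: a minimal sketch of driving
 * the Builder above, assuming the usual org.elasticsearch imports. The index
 * name and setting values are hypothetical; the point is that build() rejects
 * a missing or non-positive number_of_shards and a negative
 * number_of_replicas, so both must be set up front.
 */
class IndexMetaDataBuilderExample {
  static IndexMetaData buildMinimal() {
    Settings indexSettings =
        Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1) // must be > 0
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0) // must be >= 0
            .build();
    return new IndexMetaData.Builder("my-index").settings(indexSettings).build();
  }
}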
/** A component that holds all data paths for a single node. */
public final class NodeEnvironment implements Closeable {

  private final Logger logger;

  public static class NodePath {
    /* ${data.paths}/nodes/{node.id} */
    public final Path path;
    /* ${data.paths}/nodes/{node.id}/indices */
    public final Path indicesPath;
    /** Cached FileStore from path */
    public final FileStore fileStore;
    /**
     * Cached result of Lucene's {@code IOUtils.spins} on path. This is a trilean value: null means
     * we could not determine it (we are not running on Linux, or we hit an exception trying), True
     * means the device possibly spins and False means it does not.
     */
    public final Boolean spins;

    public final int majorDeviceNumber;
    public final int minorDeviceNumber;

    public NodePath(Path path) throws IOException {
      this.path = path;
      this.indicesPath = path.resolve(INDICES_FOLDER);
      this.fileStore = Environment.getFileStore(path);
      if (fileStore.supportsFileAttributeView("lucene")) {
        this.spins = (Boolean) fileStore.getAttribute("lucene:spins");
        this.majorDeviceNumber = (int) fileStore.getAttribute("lucene:major_device_number");
        this.minorDeviceNumber = (int) fileStore.getAttribute("lucene:minor_device_number");
      } else {
        this.spins = null;
        this.majorDeviceNumber = -1;
        this.minorDeviceNumber = -1;
      }
    }

    /**
     * Resolves the given shard's directory against this NodePath:
     * ${data.paths}/nodes/{node.id}/indices/{index.uuid}/{shard.id}
     */
    public Path resolve(ShardId shardId) {
      return resolve(shardId.getIndex()).resolve(Integer.toString(shardId.id()));
    }

    /**
     * Resolves the given index's directory against this NodePath:
     * ${data.paths}/nodes/{node.id}/indices/{index.uuid}
     */
    public Path resolve(Index index) {
      return indicesPath.resolve(index.getUUID());
    }

    @Override
    public String toString() {
      return "NodePath{" + "path=" + path + ", spins=" + spins + '}';
    }
  }

  private final NodePath[] nodePaths;
  private final Path sharedDataPath;
  private final Lock[] locks;

  private final int nodeLockId;
  private final AtomicBoolean closed = new AtomicBoolean(false);
  private final Map<ShardId, InternalShardLock> shardLocks = new HashMap<>();

  private final NodeMetaData nodeMetaData;

  /** Maximum number of data nodes that should run in an environment. */
  public static final Setting<Integer> MAX_LOCAL_STORAGE_NODES_SETTING =
      Setting.intSetting("node.max_local_storage_nodes", 1, 1, Property.NodeScope);

  /** If true, automatically append the node lock id to custom data paths. */
  public static final Setting<Boolean> ADD_NODE_LOCK_ID_TO_CUSTOM_PATH =
      Setting.boolSetting("node.add_lock_id_to_custom_path", true, Property.NodeScope);

  /**
   * Seed for determining the persisted unique uuid of this node. If the node already has a
   * persisted uuid on disk, this seed is ignored and the uuid from disk is reused.
   */
  public static final Setting<Long> NODE_ID_SEED_SETTING =
      Setting.longSetting("node.id.seed", 0L, Long.MIN_VALUE, Property.NodeScope);

  /** If true, the [verbose] SegmentInfos.infoStream logging is sent to System.out. */
  public static final Setting<Boolean> ENABLE_LUCENE_SEGMENT_INFOS_TRACE_SETTING =
      Setting.boolSetting("node.enable_lucene_segment_infos_trace", false, Property.NodeScope);

  public static final String NODES_FOLDER = "nodes";
  public static final String INDICES_FOLDER = "indices";
  public static final String NODE_LOCK_FILENAME = "node.lock";

  public NodeEnvironment(Settings settings, Environment environment) throws IOException {

    if (!DiscoveryNode.nodeRequiresLocalStorage(settings)) {
      nodePaths = null;
      sharedDataPath = null;
      locks = null;
      nodeLockId = -1;
      nodeMetaData = new NodeMetaData(generateNodeId(settings));
      logger =
          Loggers.getLogger(
              getClass(), Node.addNodeNameIfNeeded(settings, this.nodeMetaData.nodeId()));
      return;
    }
    final NodePath[] nodePaths = new NodePath[environment.dataWithClusterFiles().length];
    final Lock[] locks = new Lock[nodePaths.length];
    boolean success = false;

    // trace logger to debug issues before the default node name is derived from the node id
    Logger startupTraceLogger = Loggers.getLogger(getClass(), settings);

    try {
      sharedDataPath = environment.sharedDataFile();
      int nodeLockId = -1;
      IOException lastException = null;
      int maxLocalStorageNodes = MAX_LOCAL_STORAGE_NODES_SETTING.get(settings);
      for (int possibleLockId = 0; possibleLockId < maxLocalStorageNodes; possibleLockId++) {
        for (int dirIndex = 0; dirIndex < environment.dataFiles().length; dirIndex++) {
          Path dataDirWithClusterName = environment.dataWithClusterFiles()[dirIndex];
          Path dataDir = environment.dataFiles()[dirIndex];
          Path dir = dataDir.resolve(NODES_FOLDER).resolve(Integer.toString(possibleLockId));
          Files.createDirectories(dir);

          try (Directory luceneDir = FSDirectory.open(dir, NativeFSLockFactory.INSTANCE)) {
            startupTraceLogger.trace("obtaining node lock on {} ...", dir.toAbsolutePath());
            try {
              locks[dirIndex] = luceneDir.obtainLock(NODE_LOCK_FILENAME);
              nodePaths[dirIndex] = new NodePath(dir);
              nodeLockId = possibleLockId;
            } catch (LockObtainFailedException ex) {
              startupTraceLogger.trace("failed to obtain node lock on {}", dir.toAbsolutePath());
              // release all the ones that were obtained up until now
              releaseAndNullLocks(locks);
              break;
            }

          } catch (IOException e) {
            startupTraceLogger.trace(
                (Supplier<?>)
                    () ->
                        new ParameterizedMessage(
                            "failed to obtain node lock on {}", dir.toAbsolutePath()),
                e);
            lastException = new IOException("failed to obtain lock on " + dir.toAbsolutePath(), e);
            // release all the ones that were obtained up until now
            releaseAndNullLocks(locks);
            break;
          }
        }
        if (locks[0] != null) {
          // we found a lock, break
          break;
        }
      }

      if (locks[0] == null) {
        final String message =
            String.format(
                Locale.ROOT,
                "failed to obtain node locks, tried [%s] with lock id%s;"
                    + " maybe these locations are not writable or multiple nodes were started without increasing [%s] (was [%d])?",
                Arrays.toString(environment.dataWithClusterFiles()),
                maxLocalStorageNodes == 1 ? " [0]" : "s [0--" + (maxLocalStorageNodes - 1) + "]",
                MAX_LOCAL_STORAGE_NODES_SETTING.getKey(),
                maxLocalStorageNodes);
        throw new IllegalStateException(message, lastException);
      }
      this.nodeMetaData = loadOrCreateNodeMetaData(settings, startupTraceLogger, nodePaths);
      this.logger =
          Loggers.getLogger(
              getClass(), Node.addNodeNameIfNeeded(settings, this.nodeMetaData.nodeId()));

      this.nodeLockId = nodeLockId;
      this.locks = locks;
      this.nodePaths = nodePaths;

      if (logger.isDebugEnabled()) {
        logger.debug("using node location [{}], local_lock_id [{}]", nodePaths, nodeLockId);
      }

      maybeLogPathDetails();
      maybeLogHeapDetails();

      applySegmentInfosTrace(settings);
      assertCanWrite();
      success = true;
    } finally {
      if (success == false) {
        IOUtils.closeWhileHandlingException(locks);
      }
    }
  }

  /** Returns true if the directory is empty */
  private static boolean dirEmpty(final Path path) throws IOException {
    try (DirectoryStream<Path> stream = Files.newDirectoryStream(path)) {
      return stream.iterator().hasNext() == false;
    }
  }

  private static void releaseAndNullLocks(Lock[] locks) {
    for (int i = 0; i < locks.length; i++) {
      if (locks[i] != null) {
        IOUtils.closeWhileHandlingException(locks[i]);
      }
      locks[i] = null;
    }
  }

  private void maybeLogPathDetails() throws IOException {

    // We do some I/O in here, so skip this if DEBUG/INFO are not enabled:
    if (logger.isDebugEnabled()) {
      // Log one line per path.data:
      StringBuilder sb = new StringBuilder();
      for (NodePath nodePath : nodePaths) {
        sb.append('\n').append(" -> ").append(nodePath.path.toAbsolutePath());

        String spinsDesc;
        if (nodePath.spins == null) {
          spinsDesc = "unknown";
        } else if (nodePath.spins) {
          spinsDesc = "possibly";
        } else {
          spinsDesc = "no";
        }

        FsInfo.Path fsPath = FsProbe.getFSInfo(nodePath);
        sb.append(", free_space [")
            .append(fsPath.getFree())
            .append("], usable_space [")
            .append(fsPath.getAvailable())
            .append("], total_space [")
            .append(fsPath.getTotal())
            .append("], spins? [")
            .append(spinsDesc)
            .append("], mount [")
            .append(fsPath.getMount())
            .append("], type [")
            .append(fsPath.getType())
            .append(']');
      }
      logger.debug("node data locations details:{}", sb);
    } else if (logger.isInfoEnabled()) {
      FsInfo.Path totFSPath = new FsInfo.Path();
      Set<String> allTypes = new HashSet<>();
      Set<String> allSpins = new HashSet<>();
      Set<String> allMounts = new HashSet<>();
      for (NodePath nodePath : nodePaths) {
        FsInfo.Path fsPath = FsProbe.getFSInfo(nodePath);
        String mount = fsPath.getMount();
        if (allMounts.contains(mount) == false) {
          allMounts.add(mount);
          String type = fsPath.getType();
          if (type != null) {
            allTypes.add(type);
          }
          Boolean spins = fsPath.getSpins();
          if (spins == null) {
            allSpins.add("unknown");
          } else if (spins.booleanValue()) {
            allSpins.add("possibly");
          } else {
            allSpins.add("no");
          }
          totFSPath.add(fsPath);
        }
      }

      // Just log a 1-line summary:
      logger.info(
          "using [{}] data paths, mounts [{}], net usable_space [{}], net total_space [{}], spins? [{}], types [{}]",
          nodePaths.length,
          allMounts,
          totFSPath.getAvailable(),
          totFSPath.getTotal(),
          toString(allSpins),
          toString(allTypes));
    }
  }

  private void maybeLogHeapDetails() {
    JvmInfo jvmInfo = JvmInfo.jvmInfo();
    ByteSizeValue maxHeapSize = jvmInfo.getMem().getHeapMax();
    String useCompressedOops = jvmInfo.useCompressedOops();
    logger.info(
        "heap size [{}], compressed ordinary object pointers [{}]", maxHeapSize, useCompressedOops);
  }

  /**
   * Scans the node paths and loads the existing metadata file. If none is found, new metadata is
   * generated and persisted into the node paths.
   */
  private static NodeMetaData loadOrCreateNodeMetaData(
      Settings settings, Logger logger, NodePath... nodePaths) throws IOException {
    final Path[] paths = Arrays.stream(nodePaths).map(np -> np.path).toArray(Path[]::new);
    NodeMetaData metaData = NodeMetaData.FORMAT.loadLatestState(logger, paths);
    if (metaData == null) {
      metaData = new NodeMetaData(generateNodeId(settings));
    }
    // we write again to make sure all paths have the latest state file
    NodeMetaData.FORMAT.write(metaData, paths);
    return metaData;
  }

  public static String generateNodeId(Settings settings) {
    Random random = Randomness.get(settings, NODE_ID_SEED_SETTING);
    return UUIDs.randomBase64UUID(random);
  }
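
  // Usage sketch (added for illustration; the seed value is hypothetical): the
  // id is derived from a Random seeded by node.id.seed, so fixing the seed
  // makes the generated id reproducible, which is handy in tests:
  //
  //   Settings s = Settings.builder().put("node.id.seed", 42L).build();
  //   assert NodeEnvironment.generateNodeId(s).equals(NodeEnvironment.generateNodeId(s));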

  @SuppressForbidden(reason = "System.out.*")
  static void applySegmentInfosTrace(Settings settings) {
    if (ENABLE_LUCENE_SEGMENT_INFOS_TRACE_SETTING.get(settings)) {
      SegmentInfos.setInfoStream(System.out);
    }
  }

  private static String toString(Collection<String> items) {
    StringBuilder b = new StringBuilder();
    for (String item : items) {
      if (b.length() > 0) {
        b.append(", ");
      }
      b.append(item);
    }
    return b.toString();
  }

  /**
   * Deletes a shard data directory iff the shard's lock was successfully acquired.
   *
   * @param shardId the id of the shard to delete
   * @throws IOException if an IOException occurs
   */
  public void deleteShardDirectorySafe(ShardId shardId, IndexSettings indexSettings)
      throws IOException, ShardLockObtainFailedException {
    final Path[] paths = availableShardPaths(shardId);
    logger.trace("deleting shard {} directory, paths: [{}]", shardId, paths);
    try (ShardLock lock = shardLock(shardId)) {
      deleteShardDirectoryUnderLock(lock, indexSettings);
    }
  }

  /**
   * Acquires, then releases, all {@code write.lock} files in the given shard paths. The
   * "write.lock" file is assumed to be under the shard path's "index" directory as used by
   * Elasticsearch.
   *
   * @throws LockObtainFailedException if any of the locks could not be acquired
   */
  public static void acquireFSLockForPaths(IndexSettings indexSettings, Path... shardPaths)
      throws IOException {
    Lock[] locks = new Lock[shardPaths.length];
    Directory[] dirs = new Directory[shardPaths.length];
    try {
      for (int i = 0; i < shardPaths.length; i++) {
        // resolve the directory the shard actually lives in
        Path p = shardPaths[i].resolve("index");
        // open a directory (will be immediately closed) on the shard's location
        dirs[i] =
            new SimpleFSDirectory(
                p, indexSettings.getValue(FsDirectoryService.INDEX_LOCK_FACTOR_SETTING));
        // create a lock for the "write.lock" file
        try {
          locks[i] = dirs[i].obtainLock(IndexWriter.WRITE_LOCK_NAME);
        } catch (IOException ex) {
          throw new LockObtainFailedException(
              "unable to acquire " + IndexWriter.WRITE_LOCK_NAME + " for " + p, ex);
        }
      }
    } finally {
      IOUtils.closeWhileHandlingException(locks);
      IOUtils.closeWhileHandlingException(dirs);
    }
  }
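
  // Usage note (illustration): deleteShardDirectoryUnderLock below calls this
  // with the shard's paths purely as a probe. If an IndexWriter somewhere still
  // holds write.lock in one of the "index" directories, the resulting
  // LockObtainFailedException aborts the deletion before any data is removed.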

  /**
   * Deletes a shard data directory. Note: this method assumes that the shard lock is acquired.
   * This method will also attempt to acquire the write locks for the shard's paths before deleting
   * the data, but this is best effort, as the lock is released before the deletion happens in
   * order to allow the folder to be deleted.
   *
   * @param lock the shards lock
   * @throws IOException if an IOException occurs
   * @throws ElasticsearchException if the write.lock is not acquirable
   */
  public void deleteShardDirectoryUnderLock(ShardLock lock, IndexSettings indexSettings)
      throws IOException {
    final ShardId shardId = lock.getShardId();
    assert isShardLocked(shardId) : "shard " + shardId + " is not locked";
    final Path[] paths = availableShardPaths(shardId);
    logger.trace("acquiring locks for {}, paths: [{}]", shardId, paths);
    acquireFSLockForPaths(indexSettings, paths);
    IOUtils.rm(paths);
    if (indexSettings.hasCustomDataPath()) {
      Path customLocation = resolveCustomLocation(indexSettings, shardId);
      logger.trace("acquiring lock for {}, custom path: [{}]", shardId, customLocation);
      acquireFSLockForPaths(indexSettings, customLocation);
      logger.trace("deleting custom shard {} directory [{}]", shardId, customLocation);
      IOUtils.rm(customLocation);
    }
    logger.trace("deleted shard {} directory, paths: [{}]", shardId, paths);
    assert FileSystemUtils.exists(paths) == false;
  }

  private boolean isShardLocked(ShardId id) {
    try {
      shardLock(id, 0).close();
      return false;
    } catch (ShardLockObtainFailedException ex) {
      return true;
    }
  }

  /**
   * Deletes an index's data directory recursively iff all of the index's shard locks were
   * successfully acquired. If any of the index's shard directories can't be locked, none of the
   * shards will be deleted.
   *
   * @param index the index to delete
   * @param lockTimeoutMS how long to wait for acquiring the indices shard locks
   * @param indexSettings settings for the index being deleted
   * @throws IOException if any of the shards data directories can't be locked or deleted
   */
  public void deleteIndexDirectorySafe(Index index, long lockTimeoutMS, IndexSettings indexSettings)
      throws IOException, ShardLockObtainFailedException {
    final List<ShardLock> locks = lockAllForIndex(index, indexSettings, lockTimeoutMS);
    try {
      deleteIndexDirectoryUnderLock(index, indexSettings);
    } finally {
      IOUtils.closeWhileHandlingException(locks);
    }
  }

  /**
   * Deletes an index's data directory recursively. Note: this method assumes that the shard lock
   * is acquired.
   *
   * @param index the index to delete
   * @param indexSettings settings for the index being deleted
   */
  public void deleteIndexDirectoryUnderLock(Index index, IndexSettings indexSettings)
      throws IOException {
    final Path[] indexPaths = indexPaths(index);
    logger.trace(
        "deleting index {} directory, paths({}): [{}]", index, indexPaths.length, indexPaths);
    IOUtils.rm(indexPaths);
    if (indexSettings.hasCustomDataPath()) {
      Path customLocation = resolveIndexCustomLocation(indexSettings);
      logger.trace("deleting custom index {} directory [{}]", index, customLocation);
      IOUtils.rm(customLocation);
    }
  }

  /**
   * Tries to lock all local shards for the given index. If any of the shard locks can't be acquired
   * a {@link ShardLockObtainFailedException} is thrown and all previously acquired locks are
   * released.
   *
   * @param index the index to lock shards for
   * @param lockTimeoutMS how long to wait for acquiring the indices shard locks
   * @return the {@link ShardLock} instances for this index.
   * @throws IOException if an IOException occurs.
   */
  public List<ShardLock> lockAllForIndex(Index index, IndexSettings settings, long lockTimeoutMS)
      throws IOException, ShardLockObtainFailedException {
    final int numShards = settings.getNumberOfShards();
    if (numShards <= 0) {
      throw new IllegalArgumentException("settings must contain a non-null > 0 number of shards");
    }
    logger.trace("locking all shards for index {} - [{}]", index, numShards);
    List<ShardLock> allLocks = new ArrayList<>(numShards);
    boolean success = false;
    long startTimeNS = System.nanoTime();
    try {
      for (int i = 0; i < numShards; i++) {
        long timeoutLeftMS =
            Math.max(0, lockTimeoutMS - TimeValue.nsecToMSec((System.nanoTime() - startTimeNS)));
        allLocks.add(shardLock(new ShardId(index, i), timeoutLeftMS));
      }
      success = true;
    } finally {
      if (success == false) {
        logger.trace("unable to lock all shards for index {}", index);
        IOUtils.closeWhileHandlingException(allLocks);
      }
    }
    return allLocks;
  }

  /**
   * Tries to lock the given shard's ID. A shard lock is required to perform any kind of write
   * operation on a shard's data directory, like deleting files, creating a new index writer, or
   * recovering from a different shard instance into it. If the shard lock can not be acquired, a
   * {@link ShardLockObtainFailedException} is thrown.
   *
   * <p>Note: this method will return immediately if the lock can't be acquired.
   *
   * @param id the shard ID to lock
   * @return the shard lock. Call {@link ShardLock#close()} to release the lock
   */
  public ShardLock shardLock(ShardId id) throws ShardLockObtainFailedException {
    return shardLock(id, 0);
  }

  /**
   * Tries to lock the given shard's ID. A shard lock is required to perform any kind of write
   * operation on a shard's data directory, like deleting files, creating a new index writer, or
   * recovering from a different shard instance into it. If the shard lock can not be acquired, a
   * {@link ShardLockObtainFailedException} is thrown.
   *
   * @param shardId the shard ID to lock
   * @param lockTimeoutMS the lock timeout in milliseconds
   * @return the shard lock. Call {@link ShardLock#close()} to release the lock
   */
  public ShardLock shardLock(final ShardId shardId, long lockTimeoutMS)
      throws ShardLockObtainFailedException {
    logger.trace("acquiring node shardlock on [{}], timeout [{}]", shardId, lockTimeoutMS);
    final InternalShardLock shardLock;
    final boolean acquired;
    synchronized (shardLocks) {
      if (shardLocks.containsKey(shardId)) {
        shardLock = shardLocks.get(shardId);
        shardLock.incWaitCount();
        acquired = false;
      } else {
        shardLock = new InternalShardLock(shardId);
        shardLocks.put(shardId, shardLock);
        acquired = true;
      }
    }
    if (acquired == false) {
      boolean success = false;
      try {
        shardLock.acquire(lockTimeoutMS);
        success = true;
      } finally {
        if (success == false) {
          shardLock.decWaitCount();
        }
      }
    }
    logger.trace("successfully acquired shardlock for [{}]", shardId);
    return new ShardLock(shardId) { // new instance prevents double closing
      @Override
      protected void closeInternal() {
        shardLock.release();
        logger.trace("released shard lock for [{}]", shardId);
      }
    };
  }
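
  // Usage sketch (added for illustration; identifiers are hypothetical):
  // ShardLock is closeable, so callers typically hold it in try-with-resources
  // and the lock is released when the block exits:
  //
  //   try (ShardLock lock = nodeEnv.shardLock(shardId, 5000)) {
  //     // exclusive access to the shard's data directory while held
  //   }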

  /** A functional interface that people can use to reference {@link #shardLock(ShardId, long)} */
  @FunctionalInterface
  public interface ShardLocker {
    ShardLock lock(ShardId shardId, long lockTimeoutMS) throws ShardLockObtainFailedException;
  }

  /**
   * Returns all currently locked shards.
   *
   * <p>Note: the shard ids returned do not contain a valid index UUID
   */
  public Set<ShardId> lockedShards() {
    synchronized (shardLocks) {
      return unmodifiableSet(new HashSet<>(shardLocks.keySet()));
    }
  }

  private final class InternalShardLock {
    /*
     * This class holds a mutex for exclusive access and timeout / wait semantics
     * and a reference count to clean up the shard lock instance from the internal data
     * structure if nobody is waiting for it. The wait count is guarded by the same lock
     * that is used to mutate the map holding the shard locks to ensure exclusive access.
     */
    private final Semaphore mutex = new Semaphore(1);
    private int waitCount = 1; // guarded by shardLocks
    private final ShardId shardId;

    InternalShardLock(ShardId shardId) {
      this.shardId = shardId;
      mutex.acquireUninterruptibly();
    }

    protected void release() {
      mutex.release();
      decWaitCount();
    }

    void incWaitCount() {
      synchronized (shardLocks) {
        assert waitCount > 0 : "waitCount is " + waitCount + " but should be > 0";
        waitCount++;
      }
    }

    private void decWaitCount() {
      synchronized (shardLocks) {
        assert waitCount > 0 : "waitCount is " + waitCount + " but should be > 0";
        --waitCount;
        logger.trace("shard lock wait count for {} is now [{}]", shardId, waitCount);
        if (waitCount == 0) {
          logger.trace("last shard lock wait decremented, removing lock for {}", shardId);
          InternalShardLock remove = shardLocks.remove(shardId);
          assert remove != null : "Removed lock was null";
        }
      }
    }

    void acquire(long timeoutInMillis) throws ShardLockObtainFailedException {
      try {
        if (mutex.tryAcquire(timeoutInMillis, TimeUnit.MILLISECONDS) == false) {
          throw new ShardLockObtainFailedException(
              shardId, "obtaining shard lock timed out after " + timeoutInMillis + "ms");
        }
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new ShardLockObtainFailedException(
            shardId, "thread interrupted while trying to obtain shard lock", e);
      }
    }
  }

  public boolean hasNodeFile() {
    return nodePaths != null && locks != null;
  }

  /**
   * Returns an array of all of the node's data locations.
   *
   * @throws IllegalStateException if the node is not configured to store local locations
   */
  public Path[] nodeDataPaths() {
    assertEnvIsLocked();
    Path[] paths = new Path[nodePaths.length];
    for (int i = 0; i < paths.length; i++) {
      paths[i] = nodePaths[i].path;
    }
    return paths;
  }

  /**
   * Returns the unique uuid describing this node. The uuid is persisted in the data folder of this
   * node and remains the same across restarts.
   */
  public String nodeId() {
    // We currently only return the ID and hide the underlying nodeMetaData implementation in
    // order to avoid confusion with other "metadata" like node settings found in
    // elasticsearch.yml. In the future we can encapsulate both (and more) in one NodeMetaData
    // (or NodeSettings) object a la IndexSettings.
    return nodeMetaData.nodeId();
  }

  /** Returns an array of all of the {@link NodePath}s. */
  public NodePath[] nodePaths() {
    assertEnvIsLocked();
    if (nodePaths == null || locks == null) {
      throw new IllegalStateException("node is not configured to store local location");
    }
    return nodePaths;
  }

  /** Returns all index paths. */
  public Path[] indexPaths(Index index) {
    assertEnvIsLocked();
    Path[] indexPaths = new Path[nodePaths.length];
    for (int i = 0; i < nodePaths.length; i++) {
      indexPaths[i] = nodePaths[i].resolve(index);
    }
    return indexPaths;
  }

  /**
   * Returns all shard paths excluding custom shard path. Note: Shards are only allocated on one of
   * the returned paths. The returned array may contain paths to non-existing directories.
   *
   * @see IndexSettings#hasCustomDataPath()
   * @see #resolveCustomLocation(IndexSettings, ShardId)
   */
  public Path[] availableShardPaths(ShardId shardId) {
    assertEnvIsLocked();
    final NodePath[] nodePaths = nodePaths();
    final Path[] shardLocations = new Path[nodePaths.length];
    for (int i = 0; i < nodePaths.length; i++) {
      shardLocations[i] = nodePaths[i].resolve(shardId);
    }
    return shardLocations;
  }
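
  // Example layout (paths hypothetical): for shard 3 of an index with UUID
  // "abc123" on a node with two data paths and node lock id 0, this returns:
  //
  //   /data1/nodes/0/indices/abc123/3
  //   /data2/nodes/0/indices/abc123/3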

  /** Returns all folder names in the ${data.paths}/nodes/{node.id}/indices folder */
  public Set<String> availableIndexFolders() throws IOException {
    if (nodePaths == null || locks == null) {
      throw new IllegalStateException("node is not configured to store local location");
    }
    assertEnvIsLocked();
    Set<String> indexFolders = new HashSet<>();
    for (NodePath nodePath : nodePaths) {
      Path indicesLocation = nodePath.indicesPath;
      if (Files.isDirectory(indicesLocation)) {
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(indicesLocation)) {
          for (Path index : stream) {
            if (Files.isDirectory(index)) {
              indexFolders.add(index.getFileName().toString());
            }
          }
        }
      }
    }
    return indexFolders;
  }

  /**
   * Resolves all existing paths to <code>indexFolderName</code> in
   * ${data.paths}/nodes/{node.id}/indices
   */
  public Path[] resolveIndexFolder(String indexFolderName) throws IOException {
    if (nodePaths == null || locks == null) {
      throw new IllegalStateException("node is not configured to store local location");
    }
    assertEnvIsLocked();
    List<Path> paths = new ArrayList<>(nodePaths.length);
    for (NodePath nodePath : nodePaths) {
      Path indexFolder = nodePath.indicesPath.resolve(indexFolderName);
      if (Files.exists(indexFolder)) {
        paths.add(indexFolder);
      }
    }
    return paths.toArray(new Path[paths.size()]);
  }

  /**
   * Tries to find all allocated shards for the given index on the current node. NOTE: This method
   * is prone to race conditions on the filesystem layer since it might not see directories created
   * concurrently or while it is traversing.
   *
   * @param index the index to filter shards
   * @return a set of shard IDs
   * @throws IOException if an IOException occurs
   */
  public Set<ShardId> findAllShardIds(final Index index) throws IOException {
    assert index != null;
    if (nodePaths == null || locks == null) {
      throw new IllegalStateException("node is not configured to store local location");
    }
    assertEnvIsLocked();
    final Set<ShardId> shardIds = new HashSet<>();
    final String indexUniquePathId = index.getUUID();
    for (final NodePath nodePath : nodePaths) {
      Path location = nodePath.indicesPath;
      if (Files.isDirectory(location)) {
        try (DirectoryStream<Path> indexStream = Files.newDirectoryStream(location)) {
          for (Path indexPath : indexStream) {
            if (indexUniquePathId.equals(indexPath.getFileName().toString())) {
              shardIds.addAll(findAllShardsForIndex(indexPath, index));
            }
          }
        }
      }
    }
    return shardIds;
  }

  private static Set<ShardId> findAllShardsForIndex(Path indexPath, Index index)
      throws IOException {
    assert indexPath.getFileName().toString().equals(index.getUUID());
    Set<ShardId> shardIds = new HashSet<>();
    if (Files.isDirectory(indexPath)) {
      try (DirectoryStream<Path> stream = Files.newDirectoryStream(indexPath)) {
        for (Path shardPath : stream) {
          String fileName = shardPath.getFileName().toString();
          if (Files.isDirectory(shardPath) && fileName.chars().allMatch(Character::isDigit)) {
            int shardId = Integer.parseInt(fileName);
            ShardId id = new ShardId(index, shardId);
            shardIds.add(id);
          }
        }
      }
    }
    return shardIds;
  }

  @Override
  public void close() {
    if (closed.compareAndSet(false, true) && locks != null) {
      for (Lock lock : locks) {
        try {
          logger.trace("releasing lock [{}]", lock);
          lock.close();
        } catch (IOException e) {
          logger.trace(
              (Supplier<?>) () -> new ParameterizedMessage("failed to release lock [{}]", lock), e);
        }
      }
    }
  }

  private void assertEnvIsLocked() {
    if (!closed.get() && locks != null) {
      for (Lock lock : locks) {
        try {
          lock.ensureValid();
        } catch (IOException e) {
          logger.warn("lock assertion failed", e);
          throw new IllegalStateException("environment is not locked", e);
        }
      }
    }
  }

  /**
   * This method tries to write an empty file and moves it using an atomic move operation. This
   * method throws an {@link IllegalStateException} if this operation is not supported by the
   * filesystem. This test is executed on each of the data directories. This method cleans up all
   * files even in the case of an error.
   */
  public void ensureAtomicMoveSupported() throws IOException {
    final NodePath[] nodePaths = nodePaths();
    for (NodePath nodePath : nodePaths) {
      assert Files.isDirectory(nodePath.path) : nodePath.path + " is not a directory";
      final Path src = nodePath.path.resolve("__es__.tmp");
      final Path target = nodePath.path.resolve("__es__.final");
      try {
        Files.createFile(src);
        Files.move(src, target, StandardCopyOption.ATOMIC_MOVE);
      } catch (AtomicMoveNotSupportedException ex) {
        throw new IllegalStateException(
            "atomic_move is not supported by the filesystem on path ["
                + nodePath.path
                + "]; atomic_move is required for elasticsearch to work correctly.",
            ex);
      } finally {
        try {
          Files.deleteIfExists(src);
        } finally {
          Files.deleteIfExists(target);
        }
      }
    }
  }

  /**
   * Resolve the custom path for an index's shard. Uses the {@code IndexMetaData.SETTING_DATA_PATH}
   * setting to determine the root path for the index.
   *
   * @param indexSettings settings for the index
   */
  public Path resolveBaseCustomLocation(IndexSettings indexSettings) {
    String customDataDir = indexSettings.customDataPath();
    if (customDataDir != null) {
      // This assert is because this should be caught by MetaDataCreateIndexService
      assert sharedDataPath != null;
      if (ADD_NODE_LOCK_ID_TO_CUSTOM_PATH.get(indexSettings.getNodeSettings())) {
        return sharedDataPath.resolve(customDataDir).resolve(Integer.toString(this.nodeLockId));
      } else {
        return sharedDataPath.resolve(customDataDir);
      }
    } else {
      throw new IllegalArgumentException(
          "no custom " + IndexMetaData.SETTING_DATA_PATH + " setting available");
    }
  }

  /**
   * Resolve the custom data directory for an index. Uses the {@code
   * IndexMetaData.SETTING_DATA_PATH}
   * setting to determine the root path for the index.
   *
   * @param indexSettings settings for the index
   */
  private Path resolveIndexCustomLocation(IndexSettings indexSettings) {
    return resolveBaseCustomLocation(indexSettings).resolve(indexSettings.getUUID());
  }

  /**
   * Resolve the custom path for an index's shard. Uses the {@code IndexMetaData.SETTING_DATA_PATH}
   * setting to determine the root path for the index.
   *
   * @param indexSettings settings for the index
   * @param shardId shard to resolve the path to
   */
  public Path resolveCustomLocation(IndexSettings indexSettings, final ShardId shardId) {
    return resolveIndexCustomLocation(indexSettings).resolve(Integer.toString(shardId.id()));
  }
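
  // Example (paths hypothetical): with a shared data path of /shared, a custom
  // index.data_path of "custom", node lock id 0, index UUID "abc123" and shard
  // id 2, this resolves to /shared/custom/0/abc123/2; the lock id segment is
  // omitted when node.add_lock_id_to_custom_path is set to false.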

  /** Returns the {@code NodePath.path} for this shard. */
  public static Path shardStatePathToDataPath(Path shardPath) {
    int count = shardPath.getNameCount();

    // Sanity check:
    assert Integer.parseInt(shardPath.getName(count - 1).toString()) >= 0;
    assert "indices".equals(shardPath.getName(count - 3).toString());

    return shardPath.getParent().getParent().getParent();
  }
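
  // Example (path hypothetical): for /data1/nodes/0/indices/abc123/3 the three
  // getParent() calls strip {shard.id}, {index.uuid} and "indices", returning
  // the NodePath.path /data1/nodes/0.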

  /**
   * This is a best effort to ensure that we actually have write permissions on all our data
   * directories. This prevents disasters if nodes are started under the wrong username, etc.
   */
  private void assertCanWrite() throws IOException {
    for (Path path : nodeDataPaths()) { // check node-paths are writable
      tryWriteTempFile(path);
    }
    for (String indexFolderName : this.availableIndexFolders()) {
      for (Path indexPath :
          this.resolveIndexFolder(indexFolderName)) { // check index paths are writable
        Path indexStatePath = indexPath.resolve(MetaDataStateFormat.STATE_DIR_NAME);
        tryWriteTempFile(indexStatePath);
        tryWriteTempFile(indexPath);
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(indexPath)) {
          for (Path shardPath : stream) {
            String fileName = shardPath.getFileName().toString();
            if (Files.isDirectory(shardPath) && fileName.chars().allMatch(Character::isDigit)) {
              Path indexDir = shardPath.resolve(ShardPath.INDEX_FOLDER_NAME);
              Path statePath = shardPath.resolve(MetaDataStateFormat.STATE_DIR_NAME);
              Path translogDir = shardPath.resolve(ShardPath.TRANSLOG_FOLDER_NAME);
              tryWriteTempFile(indexDir);
              tryWriteTempFile(translogDir);
              tryWriteTempFile(statePath);
              tryWriteTempFile(shardPath);
            }
          }
        }
      }
    }
  }

  private static void tryWriteTempFile(Path path) throws IOException {
    if (Files.exists(path)) {
      Path resolve = path.resolve(".es_temp_file");
      try {
        Files.createFile(resolve);
        Files.deleteIfExists(resolve);
      } catch (IOException ex) {
        throw new IOException(
            "failed to write in data directory [" + path + "]; write permission is required", ex);
      }
    }
  }
}
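/*
 * Illustration, not part of the original source: a sketch of opening a second
 * NodeEnvironment against the same data directories. By default
 * node.max_local_storage_nodes is 1, so the constructor above fails with the
 * "failed to obtain node locks" error it builds; raising the limit lets the
 * second node claim the nodes/1 lock directory. Values are hypothetical, and
 * Environment#settings() is assumed to expose the node settings.
 */
class NodeEnvironmentLockExample {
  static NodeEnvironment openSecondNode(Environment environment) throws IOException {
    Settings settings =
        Settings.builder()
            .put(environment.settings())
            .put(NodeEnvironment.MAX_LOCAL_STORAGE_NODES_SETTING.getKey(), 2)
            .build();
    return new NodeEnvironment(settings, environment); // locks nodes/0 or nodes/1
  }
}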
public class ElectMasterService extends AbstractComponent {

  public static final Setting<Integer> DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING =
      Setting.intSetting("discovery.zen.minimum_master_nodes", -1, true, Setting.Scope.CLUSTER);

  // This is the minimum version a master needs to be on, otherwise it gets ignored
  // This is based on the minimum compatible version of the current version this node is on
  private final Version minMasterVersion;
  private final NodeComparator nodeComparator = new NodeComparator();

  private volatile int minimumMasterNodes;

  @Inject
  public ElectMasterService(Settings settings, Version version) {
    super(settings);
    this.minMasterVersion = version.minimumCompatibilityVersion();
    this.minimumMasterNodes = DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.get(settings);
    logger.debug("using minimum_master_nodes [{}]", minimumMasterNodes);
  }

  public void minimumMasterNodes(int minimumMasterNodes) {
    this.minimumMasterNodes = minimumMasterNodes;
  }

  public int minimumMasterNodes() {
    return minimumMasterNodes;
  }

  public boolean hasEnoughMasterNodes(Iterable<DiscoveryNode> nodes) {
    if (minimumMasterNodes < 1) {
      return true;
    }
    int count = 0;
    for (DiscoveryNode node : nodes) {
      if (node.masterNode()) {
        count++;
      }
    }
    return count >= minimumMasterNodes;
  }

  /**
   * Returns the given nodes sorted by likelihood of being elected as master, most likely first.
   * Non-master nodes are not removed but are rather put at the end.
   */
  public List<DiscoveryNode> sortByMasterLikelihood(Iterable<DiscoveryNode> nodes) {
    ArrayList<DiscoveryNode> sortedNodes = CollectionUtils.iterableAsArrayList(nodes);
    CollectionUtil.introSort(sortedNodes, nodeComparator);
    return sortedNodes;
  }

  /** Returns a list of the next possible masters. */
  public DiscoveryNode[] nextPossibleMasters(
      ObjectContainer<DiscoveryNode> nodes, int numberOfPossibleMasters) {
    List<DiscoveryNode> sortedNodes =
        sortedMasterNodes(Arrays.asList(nodes.toArray(DiscoveryNode.class)));
    if (sortedNodes == null) {
      return new DiscoveryNode[0];
    }
    List<DiscoveryNode> nextPossibleMasters = new ArrayList<>(numberOfPossibleMasters);
    int counter = 0;
    for (DiscoveryNode nextPossibleMaster : sortedNodes) {
      if (++counter >= numberOfPossibleMasters) {
        break;
      }
      nextPossibleMasters.add(nextPossibleMaster);
    }
    return nextPossibleMasters.toArray(new DiscoveryNode[nextPossibleMasters.size()]);
  }

  /**
   * Elects a new master out of the possible nodes, returning it. Returns <tt>null</tt> if no master
   * has been elected.
   */
  public DiscoveryNode electMaster(Iterable<DiscoveryNode> nodes) {
    List<DiscoveryNode> sortedNodes = sortedMasterNodes(nodes);
    if (sortedNodes == null || sortedNodes.isEmpty()) {
      return null;
    }
    DiscoveryNode masterNode = sortedNodes.get(0);
    // Sanity check: maybe we don't end up here, because serialization may have failed.
    if (masterNode.getVersion().before(minMasterVersion)) {
      logger.warn(
          "ignoring master [{}], because the version [{}] is lower than the minimum compatible version [{}]",
          masterNode,
          masterNode.getVersion(),
          minMasterVersion);
      return null;
    } else {
      return masterNode;
    }
  }

  private List<DiscoveryNode> sortedMasterNodes(Iterable<DiscoveryNode> nodes) {
    List<DiscoveryNode> possibleNodes = CollectionUtils.iterableAsArrayList(nodes);
    if (possibleNodes.isEmpty()) {
      return null;
    }
    // remove non-master nodes
    for (Iterator<DiscoveryNode> it = possibleNodes.iterator(); it.hasNext(); ) {
      DiscoveryNode node = it.next();
      if (!node.masterNode()) {
        it.remove();
      }
    }
    CollectionUtil.introSort(possibleNodes, nodeComparator);
    return possibleNodes;
  }

  private static class NodeComparator implements Comparator<DiscoveryNode> {

    @Override
    public int compare(DiscoveryNode o1, DiscoveryNode o2) {
      if (o1.masterNode() && !o2.masterNode()) {
        return -1;
      }
      if (!o1.masterNode() && o2.masterNode()) {
        return 1;
      }
      return o1.id().compareTo(o2.id());
    }
  }
}
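/*
 * Behavior sketch for ElectMasterService above (illustration; node ids are
 * hypothetical): sortedMasterNodes drops non-master-eligible nodes and sorts
 * the rest with NodeComparator, i.e. by node id, so electMaster over nodes
 * with ids "nodeB", "nodeA", "nodeC" returns the node "nodeA". With
 * discovery.zen.minimum_master_nodes set to 2, hasEnoughMasterNodes requires
 * at least two of those master-eligible nodes to be visible before an
 * election can succeed.
 */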
public class IndicesQueryCache extends AbstractComponent implements QueryCache, Closeable {

  public static final Setting<ByteSizeValue> INDICES_CACHE_QUERY_SIZE_SETTING =
      Setting.byteSizeSetting("indices.queries.cache.size", "10%", false, Scope.CLUSTER);
  public static final Setting<Integer> INDICES_CACHE_QUERY_COUNT_SETTING =
      Setting.intSetting("indices.queries.cache.count", 10000, 1, false, Scope.CLUSTER);

  private final LRUQueryCache cache;
  private final ShardCoreKeyMap shardKeyMap = new ShardCoreKeyMap();
  private final Map<ShardId, Stats> shardStats = new ConcurrentHashMap<>();
  private volatile long sharedRamBytesUsed;

  // This is a hack for the fact that the close listener for the
  // ShardCoreKeyMap will be called before onDocIdSetEviction
  // See onDocIdSetEviction for more info
  private final Map<Object, StatsAndCount> stats2 = new IdentityHashMap<>();

  public IndicesQueryCache(Settings settings) {
    super(settings);
    final ByteSizeValue size = INDICES_CACHE_QUERY_SIZE_SETTING.get(settings);
    final int count = INDICES_CACHE_QUERY_COUNT_SETTING.get(settings);
    logger.debug("using [node] query cache with size [{}] max filter count [{}]", size, count);
    cache =
        new LRUQueryCache(count, size.bytes()) {

          private Stats getStats(Object coreKey) {
            final ShardId shardId = shardKeyMap.getShardId(coreKey);
            if (shardId == null) {
              return null;
            }
            return shardStats.get(shardId);
          }

          private Stats getOrCreateStats(Object coreKey) {
            final ShardId shardId = shardKeyMap.getShardId(coreKey);
            Stats stats = shardStats.get(shardId);
            if (stats == null) {
              stats = new Stats();
              shardStats.put(shardId, stats);
            }
            return stats;
          }

          // It's ok to not protect these callbacks by a lock since it is
          // done in LRUQueryCache
          @Override
          protected void onClear() {
            assert Thread.holdsLock(this);
            super.onClear();
            for (Stats stats : shardStats.values()) {
              // don't throw away hit/miss
              stats.cacheSize = 0;
              stats.ramBytesUsed = 0;
            }
            sharedRamBytesUsed = 0;
          }

          @Override
          protected void onQueryCache(Query filter, long ramBytesUsed) {
            assert Thread.holdsLock(this);
            super.onQueryCache(filter, ramBytesUsed);
            sharedRamBytesUsed += ramBytesUsed;
          }

          @Override
          protected void onQueryEviction(Query filter, long ramBytesUsed) {
            assert Thread.holdsLock(this);
            super.onQueryEviction(filter, ramBytesUsed);
            sharedRamBytesUsed -= ramBytesUsed;
          }

          @Override
          protected void onDocIdSetCache(Object readerCoreKey, long ramBytesUsed) {
            assert Thread.holdsLock(this);
            super.onDocIdSetCache(readerCoreKey, ramBytesUsed);
            final Stats shardStats = getOrCreateStats(readerCoreKey);
            shardStats.cacheSize += 1;
            shardStats.cacheCount += 1;
            shardStats.ramBytesUsed += ramBytesUsed;

            StatsAndCount statsAndCount = stats2.get(readerCoreKey);
            if (statsAndCount == null) {
              statsAndCount = new StatsAndCount(shardStats);
              stats2.put(readerCoreKey, statsAndCount);
            }
            statsAndCount.count += 1;
          }

          @Override
          protected void onDocIdSetEviction(
              Object readerCoreKey, int numEntries, long sumRamBytesUsed) {
            assert Thread.holdsLock(this);
            super.onDocIdSetEviction(readerCoreKey, numEntries, sumRamBytesUsed);
            // onDocIdSetEviction might sometimes be called with a number
            // of entries equal to zero if the cache for the given segment
            // was already empty when the close listener was called
            if (numEntries > 0) {
              // We can't use ShardCoreKeyMap here because its core closed
              // listener is called before the listener of the cache which
              // triggers this eviction. So instead we use stats2, which we
              // only evict from when nothing is cached on the segment anymore,
              // instead of relying on close listeners
              final StatsAndCount statsAndCount = stats2.get(readerCoreKey);
              final Stats shardStats = statsAndCount.stats;
              shardStats.cacheSize -= numEntries;
              shardStats.ramBytesUsed -= sumRamBytesUsed;
              statsAndCount.count -= numEntries;
              if (statsAndCount.count == 0) {
                stats2.remove(readerCoreKey);
              }
            }
          }

          @Override
          protected void onHit(Object readerCoreKey, Query filter) {
            assert Thread.holdsLock(this);
            super.onHit(readerCoreKey, filter);
            final Stats shardStats = getStats(readerCoreKey);
            // a hit implies the entry was cached earlier, which both created
            // the Stats object and registered the reader with shardKeyMap,
            // so shardStats cannot be null here
            shardStats.hitCount += 1;
          }

          @Override
          protected void onMiss(Object readerCoreKey, Query filter) {
            assert Thread.holdsLock(this);
            super.onMiss(readerCoreKey, filter);
            final Stats shardStats = getOrCreateStats(readerCoreKey);
            shardStats.missCount += 1;
          }
        };
    sharedRamBytesUsed = 0;
  }
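
  // Bookkeeping note: shardStats is keyed by ShardId and outlives individual
  // segments, so hit/miss counts survive segment closes; stats2 is keyed by
  // the segment's core cache key and exists only so that onDocIdSetEviction
  // can still find the right Stats after ShardCoreKeyMap has forgotten the
  // segment. Its entries are removed once a segment has no cached entries left.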

  /** Get usage statistics for the given shard. */
  public QueryCacheStats getStats(ShardId shard) {
    final Map<ShardId, QueryCacheStats> stats = new HashMap<>();
    for (Map.Entry<ShardId, Stats> entry : shardStats.entrySet()) {
      stats.put(entry.getKey(), entry.getValue().toQueryCacheStats());
    }
    QueryCacheStats shardStats = new QueryCacheStats();
    QueryCacheStats info = stats.get(shard);
    if (info == null) {
      info = new QueryCacheStats();
    }
    shardStats.add(info);

    // We also have some shared RAM usage that we try to distribute
    // proportionally to each shard's number of cache entries
    long totalSize = 0;
    for (QueryCacheStats s : stats.values()) {
      totalSize += s.getCacheSize();
    }
    final double weight =
        totalSize == 0 ? 1d / stats.size() : (double) shardStats.getCacheSize() / totalSize;
    final long additionalRamBytesUsed = Math.round(weight * sharedRamBytesUsed);
    shardStats.add(new QueryCacheStats(additionalRamBytesUsed, 0, 0, 0, 0));
    return shardStats;
  }
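
  // Worked example (hypothetical numbers): with two shards holding 30 and 10
  // cached entries and sharedRamBytesUsed = 1000, the weights are 30/40 = 0.75
  // and 10/40 = 0.25, so getStats reports 750 and 250 additional bytes of
  // shared RAM respectively. Note the cast to double above: with plain long
  // division both weights would truncate to 0.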

  @Override
  public Weight doCache(Weight weight, QueryCachingPolicy policy) {
    while (weight instanceof CachingWeightWrapper) {
      weight = ((CachingWeightWrapper) weight).in;
    }
    final Weight in = cache.doCache(weight, policy);
    // We wrap the weight to track the readers it sees and map them to the
    // shards they belong to
    return new CachingWeightWrapper(in);
  }

  private class CachingWeightWrapper extends Weight {

    private final Weight in;

    protected CachingWeightWrapper(Weight in) {
      super(in.getQuery());
      this.in = in;
    }

    @Override
    public void extractTerms(Set<Term> terms) {
      in.extractTerms(terms);
    }

    @Override
    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
      shardKeyMap.add(context.reader());
      return in.explain(context, doc);
    }

    @Override
    public float getValueForNormalization() throws IOException {
      return in.getValueForNormalization();
    }

    @Override
    public void normalize(float norm, float topLevelBoost) {
      in.normalize(norm, topLevelBoost);
    }

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      shardKeyMap.add(context.reader());
      return in.scorer(context);
    }

    @Override
    public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
      shardKeyMap.add(context.reader());
      return in.bulkScorer(context);
    }
  }
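
  // Note that explain, scorer and bulkScorer all register the leaf reader with
  // shardKeyMap before delegating, so that when the wrapped cache later fires
  // onDocIdSetCache/onHit/onMiss for that reader, its core cache key can be
  // mapped back to a ShardId.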

  /** Clear all entries that belong to the given index. */
  public void clearIndex(String index) {
    final Set<Object> coreCacheKeys = shardKeyMap.getCoreKeysForIndex(index);
    for (Object coreKey : coreCacheKeys) {
      cache.clearCoreCacheKey(coreKey);
    }

    // This cache stores two things: filters, and doc id sets. The calls to
    // clearCoreCacheKey above only remove the doc id sets, so if the cache
    // no longer contains any DocIdSet at this point, the user most likely
    // wanted to remove everything, and we clear the cached filters as well.
    if (cache.getCacheSize() == 0) {
      cache.clear();
    }
  }

  @Override
  public void close() {
    assert shardKeyMap.size() == 0 : shardKeyMap.size();
    assert shardStats.isEmpty() : shardStats.keySet();
    assert stats2.isEmpty() : stats2;
    cache.clear();
  }

  private static class Stats implements Cloneable {

    volatile long ramBytesUsed;
    volatile long hitCount;
    volatile long missCount;
    volatile long cacheCount;
    volatile long cacheSize;

    QueryCacheStats toQueryCacheStats() {
      return new QueryCacheStats(ramBytesUsed, hitCount, missCount, cacheCount, cacheSize);
    }
  }

  private static class StatsAndCount {
    int count;
    final Stats stats;

    StatsAndCount(Stats stats) {
      this.stats = stats;
      this.count = 0;
    }
  }

  private boolean empty(Stats stats) {
    if (stats == null) {
      return true;
    }
    return stats.cacheSize == 0 && stats.ramBytesUsed == 0;
  }

  public void onClose(ShardId shardId) {
    assert empty(shardStats.get(shardId));
    shardStats.remove(shardId);
  }
}
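
The proportional attribution performed in getStats(ShardId) is easiest to see with concrete numbers. The following standalone sketch is not part of the class above; the shard count and byte figures are made up for illustration. It reproduces only the weight computation, including the cast to double that keeps the long division from truncating every weight to zero.

public class SharedRamAttributionDemo {

  public static void main(String[] args) {
    // Hypothetical figures: RAM used by cached filters (shared across shards)
    // and the number of cached doc id sets held by each of two shards.
    final long sharedRamBytesUsed = 1000L;
    final long[] cacheSizes = {30L, 10L};

    long totalSize = 0;
    for (long size : cacheSizes) {
      totalSize += size;
    }

    for (int shard = 0; shard < cacheSizes.length; shard++) {
      // Without the cast, 30 / 40 and 10 / 40 would both evaluate to 0.
      final double weight =
          totalSize == 0 ? 1d / cacheSizes.length : (double) cacheSizes[shard] / totalSize;
      final long additionalRamBytesUsed = Math.round(weight * sharedRamBytesUsed);
      // Prints "shard 0 -> 750 shared bytes" and "shard 1 -> 250 shared bytes".
      System.out.printf("shard %d -> %d shared bytes%n", shard, additionalRamBytesUsed);
    }
  }
}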